[v3] Add support for IBM Z s390x

Message ID 20230726013517.216549-1-dmiller423@gmail.com (mailing list archive)
State Changes Requested, archived
Delegated to: David Marchand
Headers
Series [v3] Add support for IBM Z s390x |

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/loongarch-compilation success Compilation OK
ci/loongarch-unit-testing success Unit Testing PASS
ci/Intel-compilation fail Compilation issues
ci/intel-Testing success Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-broadcom-Performance success Performance Testing PASS
ci/github-robot: build fail github build: failed
ci/intel-Functional success Functional PASS
ci/iol-abi-testing warning Testing issues
ci/iol-aarch-unit-testing success Testing PASS
ci/iol-x86_64-compile-testing fail Testing issues
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-unit-testing success Testing PASS
ci/iol-testing success Testing PASS
ci/iol-x86_64-unit-testing success Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-broadcom-Functional success Functional Testing PASS
ci/iol-aarch64-compile-testing success Testing PASS

Commit Message

David Miller July 26, 2023, 1:35 a.m. UTC
  Minimal changes to drivers and app to support the IBM s390x.

Signed-off-by: David Miller <dmiller423@gmail.com>
Reviewed-by: Mathew S Thoennes <tardis@us.ibm.com>
---
 app/test-acl/main.c                          |   4 +
 app/test/test_acl.c                          |   1 +
 app/test/test_atomic.c                       |   7 +-
 app/test/test_cmdline_ipaddr.c               |  12 +-
 app/test/test_cmdline_num.c                  | 110 ++++
 app/test/test_hash_functions.c               |  29 +
 app/test/test_xmmt_ops.h                     |  14 +
 buildtools/pmdinfogen.py                     |  11 +-
 config/meson.build                           |   2 +
 config/s390x/meson.build                     |  51 ++
 config/s390x/s390x_linux_clang_ubuntu        |  19 +
 doc/guides/nics/features/i40e.ini            |   1 +
 drivers/common/mlx5/mlx5_common.h            |   9 +
 drivers/net/i40e/i40e_rxtx_vec_s390x.c       | 630 +++++++++++++++++++
 drivers/net/i40e/meson.build                 |   2 +
 drivers/net/ixgbe/ixgbe_rxtx.c               |   8 +-
 drivers/net/memif/rte_eth_memif.h            |   2 +
 drivers/net/mlx5/mlx5_rx.c                   |  24 +-
 drivers/net/octeontx/base/octeontx_pki_var.h |   6 +
 examples/l3fwd/l3fwd_em.c                    |   8 +
 examples/l3fwd/l3fwd_lpm_s390x.h             | 137 ++++
 examples/l3fwd/l3fwd_s390x.h                 | 261 ++++++++
 lib/acl/acl_bld.c                            |   3 +
 lib/acl/acl_gen.c                            |   9 +
 lib/acl/acl_run_scalar.c                     |   8 +
 lib/acl/rte_acl.c                            |  27 +
 lib/acl/rte_acl.h                            |   5 +-
 lib/eal/s390x/include/meson.build            |  16 +
 lib/eal/s390x/include/rte_atomic.h           |  44 ++
 lib/eal/s390x/include/rte_byteorder.h        |  43 ++
 lib/eal/s390x/include/rte_cpuflags.h         |  41 ++
 lib/eal/s390x/include/rte_cycles.h           |  44 ++
 lib/eal/s390x/include/rte_io.h               | 184 ++++++
 lib/eal/s390x/include/rte_mcslock.h          |  18 +
 lib/eal/s390x/include/rte_memcpy.h           |  55 ++
 lib/eal/s390x/include/rte_pause.h            |  22 +
 lib/eal/s390x/include/rte_power_intrinsics.h |  20 +
 lib/eal/s390x/include/rte_prefetch.h         |  46 ++
 lib/eal/s390x/include/rte_rwlock.h           |  42 ++
 lib/eal/s390x/include/rte_spinlock.h         |  85 +++
 lib/eal/s390x/include/rte_ticketlock.h       |  18 +
 lib/eal/s390x/include/rte_vect.h             |  35 ++
 lib/eal/s390x/meson.build                    |  16 +
 lib/eal/s390x/rte_cpuflags.c                 |  91 +++
 lib/eal/s390x/rte_cycles.c                   |  11 +
 lib/eal/s390x/rte_hypervisor.c               |  11 +
 lib/eal/s390x/rte_power_intrinsics.c         |  51 ++
 lib/hash/rte_fbk_hash.h                      |   7 +
 lib/lpm/meson.build                          |   1 +
 lib/lpm/rte_lpm.h                            |   2 +
 lib/lpm/rte_lpm6.c                           |  18 +
 lib/lpm/rte_lpm_s390x.h                      | 130 ++++
 meson.build                                  |   2 +
 53 files changed, 2439 insertions(+), 14 deletions(-)
 create mode 100644 config/s390x/meson.build
 create mode 100644 config/s390x/s390x_linux_clang_ubuntu
 create mode 100644 drivers/net/i40e/i40e_rxtx_vec_s390x.c
 create mode 100644 examples/l3fwd/l3fwd_lpm_s390x.h
 create mode 100644 examples/l3fwd/l3fwd_s390x.h
 create mode 100644 lib/eal/s390x/include/meson.build
 create mode 100644 lib/eal/s390x/include/rte_atomic.h
 create mode 100644 lib/eal/s390x/include/rte_byteorder.h
 create mode 100644 lib/eal/s390x/include/rte_cpuflags.h
 create mode 100644 lib/eal/s390x/include/rte_cycles.h
 create mode 100644 lib/eal/s390x/include/rte_io.h
 create mode 100644 lib/eal/s390x/include/rte_mcslock.h
 create mode 100644 lib/eal/s390x/include/rte_memcpy.h
 create mode 100644 lib/eal/s390x/include/rte_pause.h
 create mode 100644 lib/eal/s390x/include/rte_power_intrinsics.h
 create mode 100644 lib/eal/s390x/include/rte_prefetch.h
 create mode 100644 lib/eal/s390x/include/rte_rwlock.h
 create mode 100644 lib/eal/s390x/include/rte_spinlock.h
 create mode 100644 lib/eal/s390x/include/rte_ticketlock.h
 create mode 100644 lib/eal/s390x/include/rte_vect.h
 create mode 100644 lib/eal/s390x/meson.build
 create mode 100644 lib/eal/s390x/rte_cpuflags.c
 create mode 100644 lib/eal/s390x/rte_cycles.c
 create mode 100644 lib/eal/s390x/rte_hypervisor.c
 create mode 100644 lib/eal/s390x/rte_power_intrinsics.c
 create mode 100644 lib/lpm/rte_lpm_s390x.h
  

Comments

David Marchand Aug. 2, 2023, 3:25 p.m. UTC | #1
Hello David,

On Wed, Jul 26, 2023 at 3:35 AM David Miller <dmiller423@gmail.com> wrote:
>
> Minimal changes to drivers and app to support the IBM s390x.

This seems a bit more than "minimal changes" :-).

>
> Signed-off-by: David Miller <dmiller423@gmail.com>
> Reviewed-by: Mathew S Thoennes <tardis@us.ibm.com>
> ---
>  app/test-acl/main.c                          |   4 +
>  app/test/test_acl.c                          |   1 +
>  app/test/test_atomic.c                       |   7 +-
>  app/test/test_cmdline_ipaddr.c               |  12 +-
>  app/test/test_cmdline_num.c                  | 110 ++++
>  app/test/test_hash_functions.c               |  29 +
>  app/test/test_xmmt_ops.h                     |  14 +
>  buildtools/pmdinfogen.py                     |  11 +-
>  config/meson.build                           |   2 +
>  config/s390x/meson.build                     |  51 ++
>  config/s390x/s390x_linux_clang_ubuntu        |  19 +
>  doc/guides/nics/features/i40e.ini            |   1 +
>  drivers/common/mlx5/mlx5_common.h            |   9 +
>  drivers/net/i40e/i40e_rxtx_vec_s390x.c       | 630 +++++++++++++++++++
>  drivers/net/i40e/meson.build                 |   2 +
>  drivers/net/ixgbe/ixgbe_rxtx.c               |   8 +-
>  drivers/net/memif/rte_eth_memif.h            |   2 +
>  drivers/net/mlx5/mlx5_rx.c                   |  24 +-
>  drivers/net/octeontx/base/octeontx_pki_var.h |   6 +
>  examples/l3fwd/l3fwd_em.c                    |   8 +
>  examples/l3fwd/l3fwd_lpm_s390x.h             | 137 ++++
>  examples/l3fwd/l3fwd_s390x.h                 | 261 ++++++++
>  lib/acl/acl_bld.c                            |   3 +
>  lib/acl/acl_gen.c                            |   9 +
>  lib/acl/acl_run_scalar.c                     |   8 +
>  lib/acl/rte_acl.c                            |  27 +
>  lib/acl/rte_acl.h                            |   5 +-
>  lib/eal/s390x/include/meson.build            |  16 +
>  lib/eal/s390x/include/rte_atomic.h           |  44 ++
>  lib/eal/s390x/include/rte_byteorder.h        |  43 ++
>  lib/eal/s390x/include/rte_cpuflags.h         |  41 ++
>  lib/eal/s390x/include/rte_cycles.h           |  44 ++
>  lib/eal/s390x/include/rte_io.h               | 184 ++++++
>  lib/eal/s390x/include/rte_mcslock.h          |  18 +
>  lib/eal/s390x/include/rte_memcpy.h           |  55 ++
>  lib/eal/s390x/include/rte_pause.h            |  22 +
>  lib/eal/s390x/include/rte_power_intrinsics.h |  20 +
>  lib/eal/s390x/include/rte_prefetch.h         |  46 ++
>  lib/eal/s390x/include/rte_rwlock.h           |  42 ++
>  lib/eal/s390x/include/rte_spinlock.h         |  85 +++
>  lib/eal/s390x/include/rte_ticketlock.h       |  18 +
>  lib/eal/s390x/include/rte_vect.h             |  35 ++
>  lib/eal/s390x/meson.build                    |  16 +
>  lib/eal/s390x/rte_cpuflags.c                 |  91 +++
>  lib/eal/s390x/rte_cycles.c                   |  11 +
>  lib/eal/s390x/rte_hypervisor.c               |  11 +
>  lib/eal/s390x/rte_power_intrinsics.c         |  51 ++
>  lib/hash/rte_fbk_hash.h                      |   7 +
>  lib/lpm/meson.build                          |   1 +
>  lib/lpm/rte_lpm.h                            |   2 +
>  lib/lpm/rte_lpm6.c                           |  18 +
>  lib/lpm/rte_lpm_s390x.h                      | 130 ++++
>  meson.build                                  |   2 +
>  53 files changed, 2439 insertions(+), 14 deletions(-)

- This is too big to review.
Please split this patch separating the really minimum support (getting
EAL and main libraries to build, disabling the rest that is "broken"
for s390x) then adding more components support in later patches.

RISC V and LoongArch "recent" additions are good examples.
https://patchwork.dpdk.org/project/dpdk/list/?series=23380&state=%2A&archive=both
https://patchwork.dpdk.org/project/dpdk/list/?series=24969&state=%2A&archive=both

- We need one maintainer for this new architecture.

- You'll notice that the DPDK CI reported issues, please fix them.

- What are the plans in terms of CI? We need some compilation testing
and ideally some regular runtime testing.
Maybe you can reach out to IBM PPC DPDK guys, like David Christensen,
to see what they are doing.
  
David Miller Aug. 2, 2023, 3:34 p.m. UTC | #2
Hello,

   I'm happy to split it,  I will resubmit when these changes are made.
I was planning to spend some time to figure out why the CI abi test is
failing / it had previously passed all tests locally.
The (one) long term maintainer will be Mathew S Thoennes <tardis@us.ibm.com>.
I will relay your concerns about CI and have him speak with David Christensen.

  - David Miller

On Wed, Aug 2, 2023 at 10:25 AM David Marchand
<david.marchand@redhat.com> wrote:
>
> Hello David,
>
> On Wed, Jul 26, 2023 at 3:35 AM David Miller <dmiller423@gmail.com> wrote:
> >
> > Minimal changes to drivers and app to support the IBM s390x.
>
> This seems a bit more than "minimal changes" :-).
>
> >
> > Signed-off-by: David Miller <dmiller423@gmail.com>
> > Reviewed-by: Mathew S Thoennes <tardis@us.ibm.com>
> > ---
> >  app/test-acl/main.c                          |   4 +
> >  app/test/test_acl.c                          |   1 +
> >  app/test/test_atomic.c                       |   7 +-
> >  app/test/test_cmdline_ipaddr.c               |  12 +-
> >  app/test/test_cmdline_num.c                  | 110 ++++
> >  app/test/test_hash_functions.c               |  29 +
> >  app/test/test_xmmt_ops.h                     |  14 +
> >  buildtools/pmdinfogen.py                     |  11 +-
> >  config/meson.build                           |   2 +
> >  config/s390x/meson.build                     |  51 ++
> >  config/s390x/s390x_linux_clang_ubuntu        |  19 +
> >  doc/guides/nics/features/i40e.ini            |   1 +
> >  drivers/common/mlx5/mlx5_common.h            |   9 +
> >  drivers/net/i40e/i40e_rxtx_vec_s390x.c       | 630 +++++++++++++++++++
> >  drivers/net/i40e/meson.build                 |   2 +
> >  drivers/net/ixgbe/ixgbe_rxtx.c               |   8 +-
> >  drivers/net/memif/rte_eth_memif.h            |   2 +
> >  drivers/net/mlx5/mlx5_rx.c                   |  24 +-
> >  drivers/net/octeontx/base/octeontx_pki_var.h |   6 +
> >  examples/l3fwd/l3fwd_em.c                    |   8 +
> >  examples/l3fwd/l3fwd_lpm_s390x.h             | 137 ++++
> >  examples/l3fwd/l3fwd_s390x.h                 | 261 ++++++++
> >  lib/acl/acl_bld.c                            |   3 +
> >  lib/acl/acl_gen.c                            |   9 +
> >  lib/acl/acl_run_scalar.c                     |   8 +
> >  lib/acl/rte_acl.c                            |  27 +
> >  lib/acl/rte_acl.h                            |   5 +-
> >  lib/eal/s390x/include/meson.build            |  16 +
> >  lib/eal/s390x/include/rte_atomic.h           |  44 ++
> >  lib/eal/s390x/include/rte_byteorder.h        |  43 ++
> >  lib/eal/s390x/include/rte_cpuflags.h         |  41 ++
> >  lib/eal/s390x/include/rte_cycles.h           |  44 ++
> >  lib/eal/s390x/include/rte_io.h               | 184 ++++++
> >  lib/eal/s390x/include/rte_mcslock.h          |  18 +
> >  lib/eal/s390x/include/rte_memcpy.h           |  55 ++
> >  lib/eal/s390x/include/rte_pause.h            |  22 +
> >  lib/eal/s390x/include/rte_power_intrinsics.h |  20 +
> >  lib/eal/s390x/include/rte_prefetch.h         |  46 ++
> >  lib/eal/s390x/include/rte_rwlock.h           |  42 ++
> >  lib/eal/s390x/include/rte_spinlock.h         |  85 +++
> >  lib/eal/s390x/include/rte_ticketlock.h       |  18 +
> >  lib/eal/s390x/include/rte_vect.h             |  35 ++
> >  lib/eal/s390x/meson.build                    |  16 +
> >  lib/eal/s390x/rte_cpuflags.c                 |  91 +++
> >  lib/eal/s390x/rte_cycles.c                   |  11 +
> >  lib/eal/s390x/rte_hypervisor.c               |  11 +
> >  lib/eal/s390x/rte_power_intrinsics.c         |  51 ++
> >  lib/hash/rte_fbk_hash.h                      |   7 +
> >  lib/lpm/meson.build                          |   1 +
> >  lib/lpm/rte_lpm.h                            |   2 +
> >  lib/lpm/rte_lpm6.c                           |  18 +
> >  lib/lpm/rte_lpm_s390x.h                      | 130 ++++
> >  meson.build                                  |   2 +
> >  53 files changed, 2439 insertions(+), 14 deletions(-)
>
> - This is too big to review.
> Please split this patch separating the really minimum support (getting
> EAL and main libraries to build, disabling the rest that is "broken"
> for s390x) then adding more components support in later patches.
>
> RISC V and LoongArch "recent" additions are good examples.
> https://patchwork.dpdk.org/project/dpdk/list/?series=23380&state=%2A&archive=both
> https://patchwork.dpdk.org/project/dpdk/list/?series=24969&state=%2A&archive=both
>
> - We need one maintainer for this new architecture.
>
> - You'll notice that the DPDK CI reported issues, please fix them.
>
> - What are the plans in terms of CI? We need some compilation testing
> and ideally some regular runtime testing.
> Maybe you can reach out to IBM PPC DPDK guys, like David Christensen,
> to see what they are doing.
>
>
> --
> David Marchand
>
  
David Miller Aug. 2, 2023, 3:48 p.m. UTC | #3
It looks like this is still from v2, v3 (fixes the build issue,
missing operator) was submit the same day..
The abi-test failure referenced is present on v3 (which the bot has
only accepted today).
Patch v4 will be split as requested.

Thanks.
  - David Miller

On Wed, Aug 2, 2023 at 10:34 AM David Miller <dmiller423@gmail.com> wrote:
>
> Hello,
>
>    I'm happy to split it,  I will resubmit when these changes are made.
> I was planning to spend some time to figure out why the CI abi test is
> failing / it had previously passed all tests locally.
> The (one) long term maintainer will be Mathew S Thoennes <tardis@us.ibm.com>.
> I will relay your concerns about CI and have him speak with David Christensen.
>
>   - David Miller
>
> On Wed, Aug 2, 2023 at 10:25 AM David Marchand
> <david.marchand@redhat.com> wrote:
> >
> > Hello David,
> >
> > On Wed, Jul 26, 2023 at 3:35 AM David Miller <dmiller423@gmail.com> wrote:
> > >
> > > Minimal changes to drivers and app to support the IBM s390x.
> >
> > This seems a bit more than "minimal changes" :-).
> >
> > >
> > > Signed-off-by: David Miller <dmiller423@gmail.com>
> > > Reviewed-by: Mathew S Thoennes <tardis@us.ibm.com>
> > > ---
> > >  app/test-acl/main.c                          |   4 +
> > >  app/test/test_acl.c                          |   1 +
> > >  app/test/test_atomic.c                       |   7 +-
> > >  app/test/test_cmdline_ipaddr.c               |  12 +-
> > >  app/test/test_cmdline_num.c                  | 110 ++++
> > >  app/test/test_hash_functions.c               |  29 +
> > >  app/test/test_xmmt_ops.h                     |  14 +
> > >  buildtools/pmdinfogen.py                     |  11 +-
> > >  config/meson.build                           |   2 +
> > >  config/s390x/meson.build                     |  51 ++
> > >  config/s390x/s390x_linux_clang_ubuntu        |  19 +
> > >  doc/guides/nics/features/i40e.ini            |   1 +
> > >  drivers/common/mlx5/mlx5_common.h            |   9 +
> > >  drivers/net/i40e/i40e_rxtx_vec_s390x.c       | 630 +++++++++++++++++++
> > >  drivers/net/i40e/meson.build                 |   2 +
> > >  drivers/net/ixgbe/ixgbe_rxtx.c               |   8 +-
> > >  drivers/net/memif/rte_eth_memif.h            |   2 +
> > >  drivers/net/mlx5/mlx5_rx.c                   |  24 +-
> > >  drivers/net/octeontx/base/octeontx_pki_var.h |   6 +
> > >  examples/l3fwd/l3fwd_em.c                    |   8 +
> > >  examples/l3fwd/l3fwd_lpm_s390x.h             | 137 ++++
> > >  examples/l3fwd/l3fwd_s390x.h                 | 261 ++++++++
> > >  lib/acl/acl_bld.c                            |   3 +
> > >  lib/acl/acl_gen.c                            |   9 +
> > >  lib/acl/acl_run_scalar.c                     |   8 +
> > >  lib/acl/rte_acl.c                            |  27 +
> > >  lib/acl/rte_acl.h                            |   5 +-
> > >  lib/eal/s390x/include/meson.build            |  16 +
> > >  lib/eal/s390x/include/rte_atomic.h           |  44 ++
> > >  lib/eal/s390x/include/rte_byteorder.h        |  43 ++
> > >  lib/eal/s390x/include/rte_cpuflags.h         |  41 ++
> > >  lib/eal/s390x/include/rte_cycles.h           |  44 ++
> > >  lib/eal/s390x/include/rte_io.h               | 184 ++++++
> > >  lib/eal/s390x/include/rte_mcslock.h          |  18 +
> > >  lib/eal/s390x/include/rte_memcpy.h           |  55 ++
> > >  lib/eal/s390x/include/rte_pause.h            |  22 +
> > >  lib/eal/s390x/include/rte_power_intrinsics.h |  20 +
> > >  lib/eal/s390x/include/rte_prefetch.h         |  46 ++
> > >  lib/eal/s390x/include/rte_rwlock.h           |  42 ++
> > >  lib/eal/s390x/include/rte_spinlock.h         |  85 +++
> > >  lib/eal/s390x/include/rte_ticketlock.h       |  18 +
> > >  lib/eal/s390x/include/rte_vect.h             |  35 ++
> > >  lib/eal/s390x/meson.build                    |  16 +
> > >  lib/eal/s390x/rte_cpuflags.c                 |  91 +++
> > >  lib/eal/s390x/rte_cycles.c                   |  11 +
> > >  lib/eal/s390x/rte_hypervisor.c               |  11 +
> > >  lib/eal/s390x/rte_power_intrinsics.c         |  51 ++
> > >  lib/hash/rte_fbk_hash.h                      |   7 +
> > >  lib/lpm/meson.build                          |   1 +
> > >  lib/lpm/rte_lpm.h                            |   2 +
> > >  lib/lpm/rte_lpm6.c                           |  18 +
> > >  lib/lpm/rte_lpm_s390x.h                      | 130 ++++
> > >  meson.build                                  |   2 +
> > >  53 files changed, 2439 insertions(+), 14 deletions(-)
> >
> > - This is too big to review.
> > Please split this patch separating the really minimum support (getting
> > EAL and main libraries to build, disabling the rest that is "broken"
> > for s390x) then adding more components support in later patches.
> >
> > RISC V and LoongArch "recent" additions are good examples.
> > https://patchwork.dpdk.org/project/dpdk/list/?series=23380&state=%2A&archive=both
> > https://patchwork.dpdk.org/project/dpdk/list/?series=24969&state=%2A&archive=both
> >
> > - We need one maintainer for this new architecture.
> >
> > - You'll notice that the DPDK CI reported issues, please fix them.
> >
> > - What are the plans in terms of CI? We need some compilation testing
> > and ideally some regular runtime testing.
> > Maybe you can reach out to IBM PPC DPDK guys, like David Christensen,
> > to see what they are doing.
> >
> >
> > --
> > David Marchand
> >
  
Mathew S Thoennes Aug. 14, 2023, 1:03 p.m. UTC | #4
David,

I will connect with David Christensen in regards to what they are doing for CI.  I have talked to him in the past but now that things are rolling I will reconnect with him.  I do have hardware that the CI could run on to verify the build.  I think testing of the hardware part would end up being at some regular interval since that will have to occur on internal infrastructure.  

Matt.....

Mathew Thoennes
zSystems Research
T.J. Watson Research Center
tardis@us.ibm.com
(914)945-4545

-----Original Message-----
From: David Marchand <david.marchand@redhat.com> 
Sent: Wednesday, August 2, 2023 11:25 AM
To: David Miller <dmiller423@gmail.com>
Cc: dev@dpdk.org; Mathew S Thoennes <tardis@us.ibm.com>; Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>; Olivier Matz <olivier.matz@6wind.com>; Yipeng Wang <yipeng1.wang@intel.com>; Sameh Gobriel <sameh.gobriel@intel.com>; Bruce Richardson <bruce.richardson@intel.com>; Vladimir Medvedkin <vladimir.medvedkin@intel.com>; Dmitry Kozlyuk <dmitry.kozliuk@gmail.com>; Yuying Zhang <Yuying.Zhang@intel.com>; Beilei Xing <beilei.xing@intel.com>; Matan Azrad <matan@nvidia.com>; Viacheslav Ovsiienko <viacheslavo@nvidia.com>; Ori Kam <orika@nvidia.com>; Suanming Mou <suanmingm@nvidia.com>; Qiming Yang <qiming.yang@intel.com>; Wenjun Wu <wenjun1.wu@intel.com>; Jakub Grajciar <jgrajcia@cisco.com>; Harman Kalra <hkalra@marvell.com>; Thomas Monjalon <thomas@monjalon.net>; David Christensen <drc@linux.vnet.ibm.com>
Subject: [EXTERNAL] Re: [PATCH v3] Add support for IBM Z s390x

Hello David,

On Wed, Jul 26, 2023 at 3:35 AM David Miller <dmiller423@gmail.com> wrote:
>
> Minimal changes to drivers and app to support the IBM s390x.

This seems a bit more than "minimal changes" :-).

>
> Signed-off-by: David Miller <dmiller423@gmail.com>
> Reviewed-by: Mathew S Thoennes <tardis@us.ibm.com>
> ---
>  app/test-acl/main.c                          |   4 +
>  app/test/test_acl.c                          |   1 +
>  app/test/test_atomic.c                       |   7 +-
>  app/test/test_cmdline_ipaddr.c               |  12 +-
>  app/test/test_cmdline_num.c                  | 110 ++++
>  app/test/test_hash_functions.c               |  29 +
>  app/test/test_xmmt_ops.h                     |  14 +
>  buildtools/pmdinfogen.py                     |  11 +-
>  config/meson.build                           |   2 +
>  config/s390x/meson.build                     |  51 ++
>  config/s390x/s390x_linux_clang_ubuntu        |  19 +
>  doc/guides/nics/features/i40e.ini            |   1 +
>  drivers/common/mlx5/mlx5_common.h            |   9 +
>  drivers/net/i40e/i40e_rxtx_vec_s390x.c       | 630 +++++++++++++++++++
>  drivers/net/i40e/meson.build                 |   2 +
>  drivers/net/ixgbe/ixgbe_rxtx.c               |   8 +-
>  drivers/net/memif/rte_eth_memif.h            |   2 +
>  drivers/net/mlx5/mlx5_rx.c                   |  24 +-
>  drivers/net/octeontx/base/octeontx_pki_var.h |   6 +
>  examples/l3fwd/l3fwd_em.c                    |   8 +
>  examples/l3fwd/l3fwd_lpm_s390x.h             | 137 ++++
>  examples/l3fwd/l3fwd_s390x.h                 | 261 ++++++++
>  lib/acl/acl_bld.c                            |   3 +
>  lib/acl/acl_gen.c                            |   9 +
>  lib/acl/acl_run_scalar.c                     |   8 +
>  lib/acl/rte_acl.c                            |  27 +
>  lib/acl/rte_acl.h                            |   5 +-
>  lib/eal/s390x/include/meson.build            |  16 +
>  lib/eal/s390x/include/rte_atomic.h           |  44 ++
>  lib/eal/s390x/include/rte_byteorder.h        |  43 ++
>  lib/eal/s390x/include/rte_cpuflags.h         |  41 ++
>  lib/eal/s390x/include/rte_cycles.h           |  44 ++
>  lib/eal/s390x/include/rte_io.h               | 184 ++++++
>  lib/eal/s390x/include/rte_mcslock.h          |  18 +
>  lib/eal/s390x/include/rte_memcpy.h           |  55 ++
>  lib/eal/s390x/include/rte_pause.h            |  22 +
>  lib/eal/s390x/include/rte_power_intrinsics.h |  20 +
>  lib/eal/s390x/include/rte_prefetch.h         |  46 ++
>  lib/eal/s390x/include/rte_rwlock.h           |  42 ++
>  lib/eal/s390x/include/rte_spinlock.h         |  85 +++
>  lib/eal/s390x/include/rte_ticketlock.h       |  18 +
>  lib/eal/s390x/include/rte_vect.h             |  35 ++
>  lib/eal/s390x/meson.build                    |  16 +
>  lib/eal/s390x/rte_cpuflags.c                 |  91 +++
>  lib/eal/s390x/rte_cycles.c                   |  11 +
>  lib/eal/s390x/rte_hypervisor.c               |  11 +
>  lib/eal/s390x/rte_power_intrinsics.c         |  51 ++
>  lib/hash/rte_fbk_hash.h                      |   7 +
>  lib/lpm/meson.build                          |   1 +
>  lib/lpm/rte_lpm.h                            |   2 +
>  lib/lpm/rte_lpm6.c                           |  18 +
>  lib/lpm/rte_lpm_s390x.h                      | 130 ++++
>  meson.build                                  |   2 +
>  53 files changed, 2439 insertions(+), 14 deletions(-)

- This is too big to review.
Please split this patch separating the really minimum support (getting EAL and main libraries to build, disabling the rest that is "broken"
for s390x) then adding more components support in later patches.

RISC V and LoongArch "recent" additions are good examples.
https://patchwork.dpdk.org/project/dpdk/list/?series=23380&state=%2A&archive=both
https://patchwork.dpdk.org/project/dpdk/list/?series=24969&state=%2A&archive=both 

- We need one maintainer for this new architecture.

- You'll notice that the DPDK CI reported issues, please fix them.

- What are the plans in terms of CI? We need some compilation testing and ideally some regular runtime testing.
Maybe you can reach out to IBM PPC DPDK guys, like David Christensen, to see what they are doing.


--
David Marchand
  

Patch

diff --git a/app/test-acl/main.c b/app/test-acl/main.c
index 41ce83db08..68bde76e52 100644
--- a/app/test-acl/main.c
+++ b/app/test-acl/main.c
@@ -89,6 +89,10 @@  static const struct acl_alg acl_alg[] = {
 		.name = "altivec",
 		.alg = RTE_ACL_CLASSIFY_ALTIVEC,
 	},
+	{
+		.name = "s390x",
+		.alg = RTE_ACL_CLASSIFY_S390X,
+	},
 	{
 		.name = "avx512x16",
 		.alg = RTE_ACL_CLASSIFY_AVX512X16,
diff --git a/app/test/test_acl.c b/app/test/test_acl.c
index 623f34682e..38e3374644 100644
--- a/app/test/test_acl.c
+++ b/app/test/test_acl.c
@@ -351,6 +351,7 @@  test_classify_run(struct rte_acl_ctx *acx, struct ipv4_7tuple test_data[],
 		RTE_ACL_CLASSIFY_AVX2,
 		RTE_ACL_CLASSIFY_NEON,
 		RTE_ACL_CLASSIFY_ALTIVEC,
+		RTE_ACL_CLASSIFY_S390X,
 		RTE_ACL_CLASSIFY_AVX512X16,
 		RTE_ACL_CLASSIFY_AVX512X32,
 	};
diff --git a/app/test/test_atomic.c b/app/test/test_atomic.c
index e4b997827e..37ece78425 100644
--- a/app/test/test_atomic.c
+++ b/app/test/test_atomic.c
@@ -17,6 +17,7 @@ 
 #include <rte_lcore.h>
 #include <rte_random.h>
 #include <rte_hash_crc.h>
+#include <rte_byteorder.h>
 
 #include "test.h"
 
@@ -351,6 +352,7 @@  volatile uint16_t token16;
 volatile uint32_t token32;
 volatile uint64_t token64;
 
+#ifndef RTE_ARCH_S390X
 static void
 build_crc8_table(void)
 {
@@ -441,6 +443,8 @@  test_atomic_exchange(__rte_unused void *arg)
 
 	return 0;
 }
+#endif
+
 static int
 test_atomic(void)
 {
@@ -597,6 +601,7 @@  test_atomic(void)
 	}
 #endif
 
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
 	/*
 	 * Test 16/32/64bit atomic exchange.
 	 */
@@ -628,7 +633,7 @@  test_atomic(void)
 		printf("Atomic exchange test failed\n");
 		return -1;
 	}
-
+#endif
 	return 0;
 }
 REGISTER_TEST_COMMAND(atomic_autotest, test_atomic);
diff --git a/app/test/test_cmdline_ipaddr.c b/app/test/test_cmdline_ipaddr.c
index f540063508..717aa84d23 100644
--- a/app/test/test_cmdline_ipaddr.c
+++ b/app/test/test_cmdline_ipaddr.c
@@ -6,19 +6,29 @@ 
 #include <inttypes.h>
 
 #include <rte_string_fns.h>
+#include <rte_byteorder.h>
 
 #include <cmdline_parse.h>
 #include <cmdline_parse_ipaddr.h>
 
 #include "test_cmdline.h"
 
-#define IP4(a,b,c,d) {.s_addr = (uint32_t)(((a) & 0xff) | \
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+#define IP4(a, b, c, d) {.s_addr = (uint32_t)(((a) & 0xff) | \
 					   (((b) & 0xff) << 8) | \
 					   (((c) & 0xff) << 16)  | \
 					   ((d) & 0xff)  << 24)}
 
 #define U16_SWAP(x) \
 		(((x & 0xFF) << 8) | ((x & 0xFF00) >> 8))
+#else
+#define IP4(a, b, c, d) {((uint32_t)(((a) & 0xff) << 24) | \
+					   (((b) & 0xff) << 16) | \
+					   (((c) & 0xff) << 8)  | \
+					   ((d) & 0xff))}
+
+#define U16_SWAP(x) x
+#endif
 
 /* create IPv6 address, swapping bytes where needed */
 #ifndef s6_addr16
diff --git a/app/test/test_cmdline_num.c b/app/test/test_cmdline_num.c
index 9276de59bd..4432f48e99 100644
--- a/app/test/test_cmdline_num.c
+++ b/app/test/test_cmdline_num.c
@@ -10,6 +10,7 @@ 
 
 #include <cmdline_parse.h>
 #include <cmdline_parse_num.h>
+#include <rte_byteorder.h>
 
 #include "test_cmdline.h"
 
@@ -438,6 +439,48 @@  test_parse_num_valid(void)
 			/* check if result matches what it should have matched
 			 * since unsigned numbers don't care about number of bits, we can just convert
 			 * everything to uint64_t without any worries. */
+#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN
+			switch (type) {
+			case RTE_UINT8:
+			{
+				uint8_t *temp = (uint8_t *)&result;
+				result = *temp;
+				break;
+			}
+			case RTE_UINT16:
+			{
+				uint16_t *temp = (uint16_t *)&result;
+				result = *temp;
+				break;
+			}
+			case RTE_UINT32:
+			{
+				uint32_t *temp = (uint32_t *)&result;
+				result = *temp;
+				break;
+			}
+			case RTE_INT8:
+			{
+				int8_t *temp = (int8_t *)&result;
+				result = *temp;
+				break;
+			}
+			case RTE_INT16:
+			{
+				int16_t *temp = (int16_t *)&result;
+				result = *temp;
+				break;
+			}
+			case RTE_INT32:
+			{
+				int32_t *temp = (int32_t *)&result;
+				result = *temp;
+				break;
+			}
+			default:
+				break;
+			}
+#endif
 			if (ret > 0 && num_valid_positive_strs[i].result != result) {
 				printf("Error: parsing %s as %s failed: result mismatch!\n",
 						num_valid_positive_strs[i].str, buf);
@@ -467,6 +510,7 @@  test_parse_num_valid(void)
 			 * the result is signed in this case, so we have to account for that */
 			if (ret > 0) {
 				/* detect negative */
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
 				switch (type) {
 				case RTE_INT8:
 					result = (int8_t) result;
@@ -480,6 +524,30 @@  test_parse_num_valid(void)
 				default:
 					break;
 				}
+#else
+				switch (type) {
+				case RTE_INT8:
+				{
+					int8_t *temp = (int8_t *)&result;
+					result = *temp;
+					break;
+				}
+				case RTE_INT16:
+				{
+					int16_t *temp = (int16_t *)&result;
+					result = *temp;
+					break;
+				}
+				case RTE_INT32:
+				{
+					int32_t *temp = (int32_t *)&result;
+					result = *temp;
+					break;
+				}
+				default:
+					break;
+				}
+#endif
 				if (num_valid_negative_strs[i].result == (int64_t) result)
 					continue;
 				printf("Error: parsing %s as %s failed: result mismatch!\n",
@@ -516,6 +584,48 @@  test_parse_num_valid(void)
 			/* check if result matches what it should have matched
 			 * since unsigned numbers don't care about number of bits, we can just convert
 			 * everything to uint64_t without any worries. */
+#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN
+			switch (type) {
+			case RTE_UINT8:
+			{
+				uint8_t *temp = (uint8_t *)&result;
+				result = *temp;
+				break;
+			}
+			case RTE_UINT16:
+			{
+				uint16_t *temp = (uint16_t *)&result;
+				result = *temp;
+				break;
+			}
+			case RTE_UINT32:
+			{
+				uint32_t *temp = (uint32_t *)&result;
+				result = *temp;
+				break;
+			}
+			case RTE_INT8:
+			{
+				int8_t *temp = (int8_t *)&result;
+				result = *temp;
+				break;
+			}
+			case RTE_INT16:
+			{
+				int16_t *temp = (int16_t *)&result;
+				result = *temp;
+				break;
+			}
+			case RTE_INT32:
+			{
+				int32_t *temp = (int32_t *)&result;
+				result = *temp;
+				break;
+			}
+			default:
+				break;
+			}
+#endif
 			if (ret > 0 && num_garbage_positive_strs[i].result != result) {
 				printf("Error: parsing %s as %s failed: result mismatch!\n",
 						num_garbage_positive_strs[i].str, buf);
diff --git a/app/test/test_hash_functions.c b/app/test/test_hash_functions.c
index 76d51b6e71..b387d0eabb 100644
--- a/app/test/test_hash_functions.c
+++ b/app/test/test_hash_functions.c
@@ -25,6 +25,7 @@ 
  * e.g.: key size = 4, key = 0x03020100
  *       key size = 8, key = 0x0706050403020100
  */
+#if !defined(RTE_ARCH_S390X)
 static uint32_t hash_values_jhash[2][12] = {{
 	0x8ba9414b, 0xdf0d39c9,
 	0xe4cf1d42, 0xd4ccb93c, 0x5e84eafc, 0x21362cfe,
@@ -51,6 +52,34 @@  static uint32_t hash_values_crc[2][12] = {{
 	0x789c104f, 0x53028d3e
 }
 };
+#else
+static uint32_t hash_values_jhash[2][12] = {{
+	0x8ba9414b, 0x8a2f8eb,
+	0x55dcd60b, 0xf0b95bfe, 0x1a28d94c, 0x003d8f00,
+	0x84c90b2c, 0x24b83acf, 0x5e16af2f, 0x751c9f59,
+	0x665b8254, 0x6e347c81
+},
+{
+	0x5c62c303, 0xb21d4b7b,
+	0xa33cdfcf, 0x47cf3d14, 0x1cae829f, 0x1253a9ea,
+	0x7171efd1, 0xcef21db0, 0x3df3f5fe, 0x35fd67d2,
+	0x2922cbc4, 0xeaee5c5c
+}
+};
+static uint32_t hash_values_crc[2][12] = {{
+	0x00000000, 0x13a29877,
+	0x3eef4343, 0xb6719589, 0x938d3d79, 0xed93196b,
+	0xe710a46c, 0x81f7ab71, 0x702bc9ee, 0x26c72488,
+	0x2e7092a9, 0xf2fbc80b
+},
+{
+	0xbdfd3980, 0x91e95e36,
+	0x37765e57, 0x6559eb17, 0x49c8a164, 0x18daa0d3,
+	0x67065980, 0x62f966d0, 0x4e28a2a0, 0xe342d18f,
+	0x1518c680, 0xebe8026b
+}
+};
+#endif
 
 /*******************************************************************************
  * Hash function performance test configuration section. Each performance test
diff --git a/app/test/test_xmmt_ops.h b/app/test/test_xmmt_ops.h
index 626aa9bcba..f15103719a 100644
--- a/app/test/test_xmmt_ops.h
+++ b/app/test/test_xmmt_ops.h
@@ -52,6 +52,20 @@  vect_set_epi32(int i3, int i2, int i1, int i0)
 	return data;
 }
 
+#elif defined(RTE_ARCH_S390X)
+
+/* loads the xmm_t value from address p(does not need to be 16-byte aligned)*/
+#define vect_loadu_sil128(p) vec_xld2(0, (signed int *)p)
+
+/* sets the 4 signed 32-bit integer values and returns the xmm_t variable */
+static __rte_always_inline xmm_t
+vect_set_epi32(int i3, int i2, int i1, int i0)
+{
+	xmm_t data = (xmm_t){.u32 = {i0, i1, i2, i3}};
+
+	return data;
+}
+
 #elif defined(RTE_ARCH_RISCV)
 
 #define vect_loadu_sil128(p) vect_load_128(p)
diff --git a/buildtools/pmdinfogen.py b/buildtools/pmdinfogen.py
index 2a44f17bda..10467c1a3e 100755
--- a/buildtools/pmdinfogen.py
+++ b/buildtools/pmdinfogen.py
@@ -16,8 +16,15 @@ 
 except ImportError:
     pass
 
-import coff
+try:
+    import coff
+except TypeError:
+    pass
 
+def decode_asciiz(data):
+    index = data.find(b'\x00')
+    end = index if index >= 0 else len(data)
+    return data[:end].decode()
 
 class ELFSymbol:
     def __init__(self, image, symbol):
@@ -28,7 +35,7 @@  def __init__(self, image, symbol):
     def string_value(self):
         size = self._symbol["st_size"]
         value = self.get_value(0, size)
-        return coff.decode_asciiz(value)  # not COFF-specific
+        return decode_asciiz(value)  # not COFF-specific
 
     def get_value(self, offset, size):
         section = self._symbol["st_shndx"]
diff --git a/config/meson.build b/config/meson.build
index d8223718e4..b308d1ae75 100644
--- a/config/meson.build
+++ b/config/meson.build
@@ -121,6 +121,8 @@  if cpu_instruction_set == 'generic'
         cpu_instruction_set = 'generic'
     elif host_machine.cpu_family().startswith('ppc')
         cpu_instruction_set = 'power8'
+    elif host_machine.cpu_family().startswith('s390x')
+        machine = 'z13'
     elif host_machine.cpu_family().startswith('riscv')
         cpu_instruction_set = 'riscv'
     endif
diff --git a/config/s390x/meson.build b/config/s390x/meson.build
new file mode 100644
index 0000000000..b15e74ba44
--- /dev/null
+++ b/config/s390x/meson.build
@@ -0,0 +1,51 @@ 
+# SPDX-License-Identifier: BSD-3-Clause
+# (c) Copyright IBM Corp. 2019, 2020
+
+if not dpdk_conf.get('RTE_ARCH_64')
+	error('Only 64-bit compiles are supported for this platform type')
+endif
+dpdk_conf.set('RTE_ARCH', 's390x')
+dpdk_conf.set('RTE_ARCH_S390X', 1)
+dpdk_conf.set('RTE_FORCE_INTRINSICS', 1)
+
+# overrides specific to s390x
+dpdk_conf.set('RTE_MAX_LCORE', 256)
+dpdk_conf.set('RTE_MAX_NUMA_NODES', 32)
+dpdk_conf.set('RTE_CACHE_LINE_SIZE', 128)
+
+
+
+# default to z13
+cpu_instruction_set = 'z13'
+
+# test compiler support
+cc_march_z14 = cc.has_argument('-march=z14')
+cc_march_z15 = cc.has_argument('-march=z15')
+
+
+machine_args = ['-march=' + cpu_instruction_set, '-mtune=' + cpu_instruction_set]
+
+dpdk_conf.set('RTE_MACHINE','s390x')
+dpdk_conf.set('RTE_MACHINE_CPUFLAG_ZARCH', 1)   # should this be z# 13 ?
+#dpdk_conf.set('RTE_MACHINE', cpu_instruction_set)
+
+if (cc.get_define('__s390x__', args: machine_args) != '')
+    compile_time_cpuflags += ['RTE_MACHINE_CPUFLAG_ZARCH']
+endif
+
+
+# Suppress the gcc warning "note: the layout of aggregates containing
+# vectors with 4-byte alignment has changed in GCC 5".
+if (cc.get_id() == 'gcc' and cc.version().version_compare('>=10.0') and
+        cc.version().version_compare('<12.0') and cc.has_argument('-Wno-psabi'))
+    add_project_arguments('-Wno-psabi', language: 'c')
+endif
+
+
+
+
+
+
+
+
+
diff --git a/config/s390x/s390x_linux_clang_ubuntu b/config/s390x/s390x_linux_clang_ubuntu
new file mode 100644
index 0000000000..952d1ce460
--- /dev/null
+++ b/config/s390x/s390x_linux_clang_ubuntu
@@ -0,0 +1,19 @@ 
+[binaries]
+c = 'clang'
+cpp = 'clang++'
+ar = 'llvm-ar'
+strip = 'llvm-strip'
+llvm-config = 'llvm-config'
+pcap-config = 'llvm-config'
+pkgconfig = 'pkg-config'
+
+[host_machine]
+system = 'linux'
+cpu_family = 's390x'
+cpu = 'z13'
+endian = 'big'
+
+[properties]
+platform = 'generic'
+c_args = ['-target', 'aarch64-linux-gnu', '--sysroot', '/usr/aarch64-linux-gnu']
+c_link_args = ['-target', 'aarch64-linux-gnu', '-fuse-ld=lld', '--gcc-toolchain=/usr']
diff --git a/doc/guides/nics/features/i40e.ini b/doc/guides/nics/features/i40e.ini
index e241dad047..a6a6e280af 100644
--- a/doc/guides/nics/features/i40e.ini
+++ b/doc/guides/nics/features/i40e.ini
@@ -50,6 +50,7 @@  x86-32               = Y
 x86-64               = Y
 ARMv8                = Y
 Power8               = Y
+s390x                = Y
 
 [rte_flow items]
 ah                   = Y
diff --git a/drivers/common/mlx5/mlx5_common.h b/drivers/common/mlx5/mlx5_common.h
index 28f9f41528..ea6f81f0d5 100644
--- a/drivers/common/mlx5/mlx5_common.h
+++ b/drivers/common/mlx5/mlx5_common.h
@@ -21,6 +21,11 @@ 
 #include <rte_spinlock.h>
 #include <rte_os_shim.h>
 
+/* s390x pci implementation. */
+#ifdef RTE_MACHINE_CPUFLAG_ZARCH
+#include <rte_io.h>
+#endif
+
 #include "mlx5_prm.h"
 #include "mlx5_devx_cmds.h"
 #include "mlx5_common_os.h"
@@ -407,7 +412,11 @@  mlx5_doorbell_ring(struct mlx5_uar_data *uar, uint64_t val, uint32_t index,
 	/* Ensure ordering between DB record actual update and UAR access. */
 	rte_wmb();
 #ifdef RTE_ARCH_64
+# ifndef RTE_MACHINE_CPUFLAG_ZARCH
 	*uar->db = val;
+# else
+	rte_write64_relaxed(val, uar->db);
+# endif
 #else /* !RTE_ARCH_64 */
 	rte_spinlock_lock(uar->sl_p);
 	*(volatile uint32_t *)uar->db = val;
diff --git a/drivers/net/i40e/i40e_rxtx_vec_s390x.c b/drivers/net/i40e/i40e_rxtx_vec_s390x.c
new file mode 100644
index 0000000000..cafa0b9844
--- /dev/null
+++ b/drivers/net/i40e/i40e_rxtx_vec_s390x.c
@@ -0,0 +1,630 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ * (c) Copyright IBM Corp. 2017, 2019
+ */
+
+#include <stdint.h>
+#include <vecintrin.h>
+#include <rte_ethdev_driver.h>
+#include <rte_malloc.h>
+
+#include "base/i40e_prototype.h"
+#include "base/i40e_type.h"
+#include "i40e_ethdev.h"
+#include "i40e_rxtx.h"
+#include "i40e_rxtx_vec_common.h"
+
+#pragma GCC diagnostic ignored "-Wcast-qual"
+
+typedef unsigned long long vector_unsigned_long_long
+	__attribute__((vector_size(2 * sizeof(unsigned long long))));
+typedef unsigned int vector_unsigned_int
+	__attribute__((vector_size(4 * sizeof(unsigned int))));
+typedef unsigned short vector_unsigned_short
+	__attribute__((vector_size(8 * sizeof(unsigned short))));
+typedef unsigned char vector_unsigned_char
+	__attribute__((vector_size(16 * sizeof(unsigned char))));
+
+
+static inline void
+i40e_rxq_rearm(struct i40e_rx_queue *rxq)
+{
+	int i;
+	uint16_t rx_id;
+	volatile union i40e_rx_desc *rxdp;
+
+	struct i40e_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
+	struct rte_mbuf *mb0, *mb1;
+
+	vector_unsigned_long_long hdr_room = (vector_unsigned_long_long){
+						RTE_PKTMBUF_HEADROOM,
+						RTE_PKTMBUF_HEADROOM};
+	vector_unsigned_long_long dma_addr0, dma_addr1;
+
+	rxdp = rxq->rx_ring + rxq->rxrearm_start;
+
+	/* Pull 'n' more MBUFs into the software ring */
+	if (rte_mempool_get_bulk(rxq->mp,
+				 (void *)rxep,
+				 RTE_I40E_RXQ_REARM_THRESH) < 0) {
+		if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
+		    rxq->nb_rx_desc) {
+			dma_addr0 = (vector_unsigned_long_long){};
+			for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
+				rxep[i].mbuf = &rxq->fake_mbuf;
+				vec_xstd2(dma_addr0, 0,
+					(unsigned long long *)&rxdp[i].read);
+			}
+		}
+		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+			RTE_I40E_RXQ_REARM_THRESH;
+		return;
+	}
+
+	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
+	for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) {
+		vector_unsigned_long_long vaddr0, vaddr1;
+		uintptr_t p0, p1;
+
+		mb0 = rxep[0].mbuf;
+		mb1 = rxep[1].mbuf;
+
+		 /* Flush mbuf with pkt template.
+		  * Data to be rearmed is 6 bytes long.
+		  * Though, RX will overwrite ol_flags that are coming next
+		  * anyway. So overwrite whole 8 bytes with one load:
+		  * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
+		  */
+		p0 = (uintptr_t)&mb0->rearm_data;
+		*(uint64_t *)p0 = rxq->mbuf_initializer;
+		p1 = (uintptr_t)&mb1->rearm_data;
+		*(uint64_t *)p1 = rxq->mbuf_initializer;
+
+		/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+		vaddr0 = vec_xld2(0, (unsigned long long *)&mb0->buf_addr);
+		vaddr1 = vec_xld2(0, (unsigned long long *)&mb1->buf_addr);
+
+		/* convert pa to dma_addr hdr/data */
+		dma_addr0 = vec_mergel(vaddr0, vaddr0);
+		dma_addr1 = vec_mergel(vaddr1, vaddr1);
+
+		/* add headroom to pa values */
+		dma_addr0 = dma_addr0 + hdr_room;
+		dma_addr1 = dma_addr1 + hdr_room;
+
+		/* flush desc with pa dma_addr */
+		vec_xstd2(dma_addr0, 0, (unsigned long long *)&rxdp++->read);
+		vec_xstd2(dma_addr1, 0, (unsigned long long *)&rxdp++->read);
+	}
+
+	rxq->rxrearm_start += RTE_I40E_RXQ_REARM_THRESH;
+	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
+		rxq->rxrearm_start = 0;
+
+	rxq->rxrearm_nb -= RTE_I40E_RXQ_REARM_THRESH;
+
+	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
+			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
+
+	/* Update the tail pointer on the NIC */
+	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+}
+
+static inline void
+desc_to_olflags_v(vector_unsigned_long_long descs[4], struct rte_mbuf **rx_pkts)
+{
+	vector_unsigned_int vlan0, vlan1, rss, l3_l4e;
+
+	/* mask everything except RSS, flow director and VLAN flags
+	 * bit2 is for VLAN tag, bit11 for flow director indication
+	 * bit13:12 for RSS indication.
+	 */
+	const vector_unsigned_int rss_vlan_msk = (vector_unsigned_int){
+			(int32_t)0x1c03804, (int32_t)0x1c03804,
+			(int32_t)0x1c03804, (int32_t)0x1c03804};
+
+	/* map rss and vlan type to rss hash and vlan flag */
+	const vector_unsigned_char vlan_flags = (vector_unsigned_char){
+			0, 0, 0, 0,
+			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0, 0, 0,
+			0, 0, 0, 0,
+			0, 0, 0, 0};
+
+	const vector_unsigned_char rss_flags = (vector_unsigned_char){
+			0, PKT_RX_FDIR, 0, 0,
+			0, 0, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH | PKT_RX_FDIR,
+			0, 0, 0, 0,
+			0, 0, 0, 0};
+
+	const vector_unsigned_char l3_l4e_flags = (vector_unsigned_char){
+			0,
+			PKT_RX_IP_CKSUM_BAD,
+			PKT_RX_L4_CKSUM_BAD,
+			PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD,
+			PKT_RX_IP_CKSUM_BAD,
+			PKT_RX_IP_CKSUM_BAD,
+			PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD,
+			PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD,
+			0, 0, 0, 0, 0, 0, 0, 0};
+
+	vlan0 = (vector_unsigned_int)vec_mergel(descs[0], descs[1]);
+	vlan1 = (vector_unsigned_int)vec_mergel(descs[2], descs[3]);
+	vlan0 = (vector_unsigned_int)vec_mergeh(vlan0, vlan1);
+
+	vlan1 = vec_and(vlan0, rss_vlan_msk);
+	vlan0 = (vector_unsigned_int)vec_perm(vlan_flags,
+					(vector_unsigned_char){},
+					*(vector_unsigned_char *)&vlan1);
+
+	rss[0] = (uint32_t)vlan1[0] >> 11;
+	rss[1] = (uint32_t)vlan1[1] >> 11;
+	rss[2] = (uint32_t)vlan1[2] >> 11;
+	rss[3] = (uint32_t)vlan1[3] >> 11;
+	rss = (vector_unsigned_int)vec_perm(rss_flags, (vector_unsigned_char){},
+					*(vector_unsigned_char *)&rss);
+
+	l3_l4e[0] = (uint32_t)vlan1[0] >> 22;
+	l3_l4e[1] = (uint32_t)vlan1[1] >> 22;
+	l3_l4e[2] = (uint32_t)vlan1[2] >> 22;
+	l3_l4e[3] = (uint32_t)vlan1[3] >> 22;
+
+	l3_l4e = (vector_unsigned_int)vec_perm(l3_l4e_flags,
+					(vector_unsigned_char){},
+					*(vector_unsigned_char *)&l3_l4e);
+
+	vlan0 = vec_or(vlan0, rss);
+	vlan0 = vec_or(vlan0, l3_l4e);
+
+	rx_pkts[0]->ol_flags = (uint64_t)vlan0[2];
+	rx_pkts[1]->ol_flags = (uint64_t)vlan0[3];
+	rx_pkts[2]->ol_flags = (uint64_t)vlan0[0];
+	rx_pkts[3]->ol_flags = (uint64_t)vlan0[1];
+}
+
+#define PKTLEN_SHIFT     10
+
+static inline void
+desc_to_ptype_v(vector_unsigned_long_long descs[4], struct rte_mbuf **rx_pkts,
+		uint32_t *ptype_tbl)
+{
+	vector_unsigned_long_long ptype0 = vec_mergel(descs[0], descs[1]);
+	vector_unsigned_long_long ptype1 = vec_mergel(descs[2], descs[3]);
+
+	ptype0[0] = ptype0[0] >> 30;
+	ptype0[1] = ptype0[1] >> 30;
+
+	ptype1[0] = ptype1[0] >> 30;
+	ptype1[1] = ptype1[1] >> 30;
+
+	rx_pkts[0]->packet_type =
+		ptype_tbl[(*(vector_unsigned_char *)&ptype0)[0]];
+	rx_pkts[1]->packet_type =
+		ptype_tbl[(*(vector_unsigned_char *)&ptype0)[8]];
+	rx_pkts[2]->packet_type =
+		ptype_tbl[(*(vector_unsigned_char *)&ptype1)[0]];
+	rx_pkts[3]->packet_type =
+		ptype_tbl[(*(vector_unsigned_char *)&ptype1)[8]];
+}
+
+ /* Notice:
+  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
+  * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
+  *   numbers of DD bits
+  */
+static inline uint16_t
+_recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
+		   uint16_t nb_pkts, uint8_t *split_packet)
+{
+	volatile union i40e_rx_desc *rxdp;
+	struct i40e_rx_entry *sw_ring;
+	uint16_t nb_pkts_recd;
+	int pos;
+	uint64_t var;
+	vector_unsigned_char shuf_msk;
+	uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
+
+	vector_unsigned_short crc_adjust = (vector_unsigned_short){
+		0, 0,         /* ignore pkt_type field */
+		rxq->crc_len, /* sub crc on pkt_len */
+		0,            /* ignore high-16bits of pkt_len */
+		rxq->crc_len, /* sub crc on data_len */
+		0, 0, 0       /* ignore non-length fields */
+		};
+	vector_unsigned_long_long dd_check, eop_check;
+
+	/* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);
+
+	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);
+
+	/* Just the act of getting into the function from the application is
+	 * going to cost about 7 cycles
+	 */
+	rxdp = rxq->rx_ring + rxq->rx_tail;
+
+	rte_prefetch0(rxdp);
+
+	/* See if we need to rearm the RX queue - gives the prefetch a bit
+	 * of time to act
+	 */
+	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
+		i40e_rxq_rearm(rxq);
+
+	/* Before we start moving massive data around, check to see if
+	 * there is actually a packet available
+	 */
+	if (!(rxdp->wb.qword1.status_error_len &
+			rte_cpu_to_le_32(1 << I40E_RX_DESC_STATUS_DD_SHIFT)))
+		return 0;
+
+	/* 4 packets DD mask */
+	dd_check = (vector_unsigned_long_long){0x0000000100000001ULL,
+					  0x0000000100000001ULL};
+
+	/* 4 packets EOP mask */
+	eop_check = (vector_unsigned_long_long){0x0000000200000002ULL,
+					   0x0000000200000002ULL};
+
+	/* mask to shuffle from desc. to mbuf */
+	shuf_msk = (vector_unsigned_char){
+		0xFF, 0xFF,   /* pkt_type set as unknown */
+		0xFF, 0xFF,   /* pkt_type set as unknown */
+		14, 15,       /* octet 15~14, low 16 bits pkt_len */
+		0xFF, 0xFF,   /* skip high 16 bits pkt_len, zero out */
+		14, 15,       /* octet 15~14, 16 bits data_len */
+		2, 3,         /* octet 2~3, low 16 bits vlan_macip */
+		4, 5, 6, 7    /* octet 4~7, 32bits rss */
+		};
+
+	/* Cache is empty -> need to scan the buffer rings, but first move
+	 * the next 'n' mbufs into the cache
+	 */
+	sw_ring = &rxq->sw_ring[rxq->rx_tail];
+
+	/* A. load 4 packet in one loop
+	 * [A*. mask out 4 unused dirty field in desc]
+	 * B. copy 4 mbuf point from swring to rx_pkts
+	 * C. calc the number of DD bits among the 4 packets
+	 * [C*. extract the end-of-packet bit, if requested]
+	 * D. fill info. from desc to mbuf
+	 */
+
+	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
+			pos += RTE_I40E_DESCS_PER_LOOP,
+			rxdp += RTE_I40E_DESCS_PER_LOOP) {
+		vector_unsigned_long_long descs[RTE_I40E_DESCS_PER_LOOP];
+		vector_unsigned_char pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
+		vector_unsigned_short staterr, sterr_tmp1, sterr_tmp2;
+		vector_unsigned_long_long mbp1, mbp2;  /* two mbuf pointer
+							* in one XMM reg.
+							*/
+
+		/* B.1 load 1 mbuf point */
+		mbp1 = *(vector_unsigned_long_long *)&sw_ring[pos];
+		/* Read desc statuses backwards to avoid race condition */
+		/* A.1 load 4 pkts desc */
+		descs[3] = *(vector_unsigned_long_long *)(rxdp + 3);
+		rte_compiler_barrier();
+
+		/* B.2 copy 2 mbuf point into rx_pkts  */
+		*(vector_unsigned_long_long *)&rx_pkts[pos] = mbp1;
+
+		/* B.1 load 1 mbuf point */
+		mbp2 = *(vector_unsigned_long_long *)&sw_ring[pos + 2];
+
+		descs[2] = *(vector_unsigned_long_long *)(rxdp + 2);
+		rte_compiler_barrier();
+		/* B.1 load 2 mbuf point */
+		descs[1] = *(vector_unsigned_long_long *)(rxdp + 1);
+		rte_compiler_barrier();
+		descs[0] = *(vector_unsigned_long_long *)(rxdp);
+
+		/* B.2 copy 2 mbuf point into rx_pkts  */
+		*(vector_unsigned_long_long *)&rx_pkts[pos + 2] =  mbp2;
+
+		if (split_packet) {
+			rte_mbuf_prefetch_part2(rx_pkts[pos]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
+		}
+
+		/* avoid compiler reorder optimization */
+		rte_compiler_barrier();
+
+		/* pkt 3,4 shift the pktlen field to be 16-bit aligned*/
+		vector_unsigned_int len3_temp = vec_xld2(0,
+				(unsigned int *)&descs[3]);
+		len3_temp[3] = len3_temp[3] << PKTLEN_SHIFT;
+		const vector_unsigned_int len3 = len3_temp;
+
+		vector_unsigned_int len2_temp = vec_xld2(0,
+				(unsigned int *)&descs[2]);
+		len2_temp[3] = len2_temp[3] << PKTLEN_SHIFT;
+		const vector_unsigned_int len2 = len2_temp;
+
+		/* merge the now-aligned packet length fields back in */
+		descs[3] = (vector_unsigned_long_long)len3;
+		descs[2] = (vector_unsigned_long_long)len2;
+
+		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
+		pkt_mb4 = vec_perm((vector_unsigned_char)descs[3],
+				  (vector_unsigned_char){}, shuf_msk);
+		pkt_mb3 = vec_perm((vector_unsigned_char)descs[2],
+				  (vector_unsigned_char){}, shuf_msk);
+
+		/* C.1 4=>2 filter staterr info only */
+		sterr_tmp2 = vec_mergel((vector_unsigned_short)descs[3],
+					(vector_unsigned_short)descs[2]);
+		/* C.1 4=>2 filter staterr info only */
+		sterr_tmp1 = vec_mergel((vector_unsigned_short)descs[1],
+					(vector_unsigned_short)descs[0]);
+		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
+		pkt_mb4 = (vector_unsigned_char)((vector_unsigned_short)pkt_mb4
+				- crc_adjust);
+		pkt_mb3 = (vector_unsigned_char)((vector_unsigned_short)pkt_mb3
+				- crc_adjust);
+
+		/* pkt 1,2 shift the pktlen field to be 16-bit aligned*/
+		const vector_unsigned_int len1 =
+			vec_sll(vec_xld2(0, (unsigned int *)&descs[1]),
+			(vector_unsigned_int){0, 0, 0, PKTLEN_SHIFT});
+		const vector_unsigned_int len0 =
+			vec_sll(vec_xld2(0, (unsigned int *)&descs[0]),
+			(vector_unsigned_int){0, 0, 0, PKTLEN_SHIFT});
+
+		/* merge the now-aligned packet length fields back in */
+		descs[1] = (vector_unsigned_long_long)len1;
+		descs[0] = (vector_unsigned_long_long)len0;
+
+		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
+		pkt_mb2 = vec_perm((vector_unsigned_char)descs[1],
+				   (vector_unsigned_char){}, shuf_msk);
+		pkt_mb1 = vec_perm((vector_unsigned_char)descs[0],
+				   (vector_unsigned_char){}, shuf_msk);
+
+		/* C.2 get 4 pkts staterr value  */
+		staterr = (vector_unsigned_short)vec_mergeh(sterr_tmp1,
+				sterr_tmp2);
+
+		/* D.3 copy final 3,4 data to rx_pkts */
+		vec_xstd2(pkt_mb4, 0, (unsigned char *)&rx_pkts[pos + 3]
+			->rx_descriptor_fields1);
+		vec_xstd2(pkt_mb3, 0, (unsigned char *)&rx_pkts[pos + 2]
+			->rx_descriptor_fields1);
+
+		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
+		pkt_mb2 = (vector_unsigned_char)((vector_unsigned_short)pkt_mb2
+				- crc_adjust);
+		pkt_mb1 = (vector_unsigned_char)((vector_unsigned_short)pkt_mb1
+				- crc_adjust);
+
+		/* C* extract and record EOP bit */
+		if (split_packet) {
+			vector_unsigned_char eop_shuf_mask =
+				(vector_unsigned_char){
+					0xFF, 0xFF, 0xFF, 0xFF,
+					0xFF, 0xFF, 0xFF, 0xFF,
+					0xFF, 0xFF, 0xFF, 0xFF,
+					0x04, 0x0C, 0x00, 0x08
+				};
+
+			/* and with mask to extract bits, flipping 1-0 */
+			vector_unsigned_char eop_bits =
+				vec_and((vector_unsigned_char)vec_nor(staterr,
+				staterr), (vector_unsigned_char)eop_check);
+			/* the staterr values are not in order, as the
+			 * count of dd bits doesn't care. However, for end of
+			 * packet tracking, we do care, so shuffle. This also
+			 * compresses the 32-bit values to 8-bit
+			 */
+			eop_bits = vec_perm(eop_bits, (vector_unsigned_char){},
+					    eop_shuf_mask);
+			/* store the resulting 32-bit value */
+			*split_packet = (vec_xld2(0,
+					 (unsigned int *)&eop_bits))[0];
+			split_packet += RTE_I40E_DESCS_PER_LOOP;
+
+			/* zero-out next pointers */
+			rx_pkts[pos]->next = NULL;
+			rx_pkts[pos + 1]->next = NULL;
+			rx_pkts[pos + 2]->next = NULL;
+			rx_pkts[pos + 3]->next = NULL;
+		}
+
+		/* C.3 calc available number of desc */
+		staterr = vec_and(staterr, (vector_unsigned_short)dd_check);
+
+		/* D.3 copy final 1,2 data to rx_pkts */
+		vec_xstd2(pkt_mb2, 0, (unsigned char *)&rx_pkts[pos + 1]
+			->rx_descriptor_fields1);
+		vec_xstd2(pkt_mb1, 0, (unsigned char *)&rx_pkts[pos]
+			->rx_descriptor_fields1);
+
+		desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
+		desc_to_olflags_v(descs, &rx_pkts[pos]);
+
+		/* C.4 calc avaialbe number of desc */
+		var = __builtin_popcountll((vec_xld2(0,
+			(unsigned long long *)&staterr)[0]));
+		nb_pkts_recd += var;
+		if (likely(var != RTE_I40E_DESCS_PER_LOOP))
+			break;
+	}
+
+	/* Update our internal tail pointer */
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
+	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);
+
+	return nb_pkts_recd;
+}
+
+ /* Notice:
+  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
+  * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
+  *   numbers of DD bits
+  */
+uint16_t
+i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+		   uint16_t nb_pkts)
+{
+	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
+}
+
+ /* vPMD receive routine that reassembles scattered packets
+  * Notice:
+  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
+  * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
+  *   numbers of DD bits
+  */
+uint16_t
+i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			     uint16_t nb_pkts)
+{
+	struct i40e_rx_queue *rxq = rx_queue;
+	uint8_t split_flags[RTE_I40E_VPMD_RX_BURST] = {0};
+
+	/* get some new buffers */
+	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
+			split_flags);
+	if (nb_bufs == 0)
+		return 0;
+
+	/* happy day case, full burst + no packets to be joined */
+	const uint64_t *split_fl64 = (uint64_t *)split_flags;
+
+	if (rxq->pkt_first_seg == NULL &&
+	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
+	    split_fl64[2] == 0 && split_fl64[3] == 0)
+		return nb_bufs;
+
+	/* reassemble any packets that need reassembly*/
+	unsigned int i = 0;
+
+	if (!rxq->pkt_first_seg) {
+		/* find the first split flag, and only reassemble then*/
+		while (i < nb_bufs && !split_flags[i])
+			i++;
+		if (i == nb_bufs)
+			return nb_bufs;
+	}
+	return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
+		&split_flags[i]);
+}
+
+static inline void
+vtx1(volatile struct i40e_tx_desc *txdp,
+	struct rte_mbuf *pkt, uint64_t flags)
+{
+	uint64_t high_qw = (I40E_TX_DESC_DTYPE_DATA |
+		((uint64_t)flags  << I40E_TXD_QW1_CMD_SHIFT) |
+		((uint64_t)pkt->data_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT));
+
+	vector_unsigned_long_long descriptor = (vector_unsigned_long_long){
+		pkt->buf_iova + pkt->data_off, high_qw};
+	*(vector_unsigned_long_long *)txdp = descriptor;
+}
+
+static inline void
+vtx(volatile struct i40e_tx_desc *txdp,
+	struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
+{
+	int i;
+
+	for (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt)
+		vtx1(txdp, *pkt, flags);
+}
+
+uint16_t
+i40e_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+			  uint16_t nb_pkts)
+{
+	struct i40e_tx_queue *txq = (struct i40e_tx_queue *)tx_queue;
+	volatile struct i40e_tx_desc *txdp;
+	struct i40e_tx_entry *txep;
+	uint16_t n, nb_commit, tx_id;
+	uint64_t flags = I40E_TD_CMD;
+	uint64_t rs = I40E_TX_DESC_CMD_RS | I40E_TD_CMD;
+	int i;
+
+	/* cross rx_thresh boundary is not allowed */
+	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+
+	if (txq->nb_tx_free < txq->tx_free_thresh)
+		i40e_tx_free_bufs(txq);
+
+	nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+	nb_commit = nb_pkts;
+	if (unlikely(nb_pkts == 0))
+		return 0;
+
+	tx_id = txq->tx_tail;
+	txdp = &txq->tx_ring[tx_id];
+	txep = &txq->sw_ring[tx_id];
+
+	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+
+	n = (uint16_t)(txq->nb_tx_desc - tx_id);
+	if (nb_commit >= n) {
+		tx_backlog_entry(txep, tx_pkts, n);
+
+		for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
+			vtx1(txdp, *tx_pkts, flags);
+
+		vtx1(txdp, *tx_pkts++, rs);
+
+		nb_commit = (uint16_t)(nb_commit - n);
+
+		tx_id = 0;
+		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+
+		/* avoid reach the end of ring */
+		txdp = &txq->tx_ring[tx_id];
+		txep = &txq->sw_ring[tx_id];
+	}
+
+	tx_backlog_entry(txep, tx_pkts, nb_commit);
+
+	vtx(txdp, tx_pkts, nb_commit, flags);
+
+	tx_id = (uint16_t)(tx_id + nb_commit);
+	if (tx_id > txq->tx_next_rs) {
+		txq->tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
+			rte_cpu_to_le_64(((uint64_t)I40E_TX_DESC_CMD_RS) <<
+						I40E_TXD_QW1_CMD_SHIFT);
+		txq->tx_next_rs =
+			(uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
+	}
+
+	txq->tx_tail = tx_id;
+
+	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+
+	return nb_pkts;
+}
+
+void __rte_cold
+i40e_rx_queue_release_mbufs_vec(struct i40e_rx_queue *rxq)
+{
+	_i40e_rx_queue_release_mbufs_vec(rxq);
+}
+
+int __rte_cold
+i40e_rxq_vec_setup(struct i40e_rx_queue *rxq)
+{
+	return i40e_rxq_vec_setup_default(rxq);
+}
+
+int __rte_cold
+i40e_txq_vec_setup(struct i40e_tx_queue __rte_unused * txq)
+{
+	return 0;
+}
+
+int __rte_cold
+i40e_rx_vec_dev_conf_condition_check(struct rte_eth_dev *dev)
+{
+	return i40e_rx_vec_dev_conf_condition_check_default(dev);
+}
diff --git a/drivers/net/i40e/meson.build b/drivers/net/i40e/meson.build
index 8e53b87a65..d60f4a27a6 100644
--- a/drivers/net/i40e/meson.build
+++ b/drivers/net/i40e/meson.build
@@ -91,6 +91,8 @@  if arch_subdir == 'x86'
     endif
 elif arch_subdir == 'ppc'
        sources += files('i40e_rxtx_vec_altivec.c')
+elif arch_subdir == 's390x'
+       sources += files('i40e_rxtx_vec_s390x.c')
 elif arch_subdir == 'arm'
        sources += files('i40e_rxtx_vec_neon.c')
 endif
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 954ef241a0..bfdaa7c66e 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -6029,11 +6029,11 @@  ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 	return 0;
 }
 
-/* Stubs needed for linkage when RTE_ARCH_PPC_64, RTE_ARCH_RISCV or
- * RTE_ARCH_LOONGARCH is set.
+/* Stubs needed for linkage when RTE_ARCH_PPC_64, RTE_ARCH_RISCV,
+ * RTE_ARCH_LOONGARCH or RTE_ARCH_S390X is set.
  */
-#if defined(RTE_ARCH_PPC_64) || defined(RTE_ARCH_RISCV) || \
-	defined(RTE_ARCH_LOONGARCH)
+#if defined(RTE_ARCH_PPC_64)     || defined(RTE_ARCH_S390X) || \
+	defined(RTE_ARCH_LOONGARCH)  || defined(RTE_ARCH_RISCV)
 int
 ixgbe_rx_vec_dev_conf_condition_check(struct rte_eth_dev __rte_unused *dev)
 {
diff --git a/drivers/net/memif/rte_eth_memif.h b/drivers/net/memif/rte_eth_memif.h
index f21806c811..0c4ffa92b9 100644
--- a/drivers/net/memif/rte_eth_memif.h
+++ b/drivers/net/memif/rte_eth_memif.h
@@ -182,6 +182,8 @@  const char *memif_version(void);
 #define __NR_memfd_create 279
 #elif defined __powerpc__
 #define __NR_memfd_create 360
+#elif defined __s390x__
+#define __NR_memfd_create 350
 #elif defined __i386__
 #define __NR_memfd_create 356
 #elif defined __riscv
diff --git a/drivers/net/mlx5/mlx5_rx.c b/drivers/net/mlx5/mlx5_rx.c
index 392784050f..4af08e68dc 100644
--- a/drivers/net/mlx5/mlx5_rx.c
+++ b/drivers/net/mlx5/mlx5_rx.c
@@ -229,6 +229,8 @@  mlx5_rx_burst_mode_get(struct rte_eth_dev *dev,
 		snprintf(mode->info, sizeof(mode->info), "%s", "Vector Neon");
 #elif defined RTE_ARCH_PPC_64
 		snprintf(mode->info, sizeof(mode->info), "%s", "Vector AltiVec");
+#elif defined RTE_ARCH_S390X
+		snprintf(mode->info, sizeof(mode->info), "%s", "Vector S390X");
 #else
 		return -EINVAL;
 #endif
@@ -239,6 +241,8 @@  mlx5_rx_burst_mode_get(struct rte_eth_dev *dev,
 		snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector Neon");
 #elif defined RTE_ARCH_PPC_64
 		snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector AltiVec");
+#elif defined RTE_ARCH_S390X
+		snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector S390X");
 #else
 		return -EINVAL;
 #endif
@@ -340,12 +344,24 @@  rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
 	uint8_t ptype;
 	uint8_t pinfo = (cqe->pkt_info & 0x3) << 6;
 
+	/*
+	 * hdr_type_etc is from the cqe thus it is BE
+	 * the logic below did not convert BE -> LE prior
+	 * to using the value of it.  So the logic below
+	 * is written for LE thus the value of hdr_type_etc has
+	 * to be converted from LE to BE for the logic to work
+	 */
+	uint16_t cqe_t_le  = rte_le_to_cpu_16(cqe->hdr_type_etc);
+	uint16_t mcqe_t_le;
+
 	/* Get l3/l4 header from mini-CQE in case L3/L4 format*/
 	if (mcqe == NULL ||
-	    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX)
-		ptype = (cqe->hdr_type_etc & 0xfc00) >> 10;
-	else
-		ptype = mcqe->hdr_type >> 2;
+		rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX) {
+		ptype = (cqe_t_le & 0xfc00) >> 10;
+	} else {
+		mcqe_t_le = rte_le_to_cpu_16(mcqe->hdr_type);
+		ptype = mcqe_t_le >> 2;
+	}
 	/*
 	 * The index to the array should have:
 	 * bit[1:0] = l3_hdr_type
diff --git a/drivers/net/octeontx/base/octeontx_pki_var.h b/drivers/net/octeontx/base/octeontx_pki_var.h
index 4445369ce7..b37d79eb83 100644
--- a/drivers/net/octeontx/base/octeontx_pki_var.h
+++ b/drivers/net/octeontx/base/octeontx_pki_var.h
@@ -157,6 +157,12 @@  typedef union octtx_wqe_s {
 			uint64_t	lbptr : 8;
 			uint64_t	laptr : 8;
 		} w4;
+
+		struct {
+			uint64_t	size  :16;
+			uint64_t	dwd   : 1;
+			uint64_t	rsvd0 :47;
+		} w5;
 #endif
 	} s;
 
diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 476ac0c54f..24e1b93f42 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -239,6 +239,14 @@  em_mask_key(void *key, xmm_t mask)
 
 	return vec_and(data, mask);
 }
+#elif defined(__s390x__)
+static inline xmm_t
+em_mask_key(void *key, xmm_t mask)
+{
+	xmm_t data = (xmm_t) vec_xld2(0, (unsigned int *)(key));
+
+	return data + mask;
+}
 #elif defined(RTE_ARCH_RISCV)
 static inline xmm_t
 em_mask_key(void *key, xmm_t mask)
diff --git a/examples/l3fwd/l3fwd_lpm_s390x.h b/examples/l3fwd/l3fwd_lpm_s390x.h
new file mode 100644
index 0000000000..858f696ba9
--- /dev/null
+++ b/examples/l3fwd/l3fwd_lpm_s390x.h
@@ -0,0 +1,137 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation.
+ * (c) Copyright IBM Corp. 2017, 2019
+ */
+#ifndef __L3FWD_LPM_S390X_H__
+#define __L3FWD_LPM_S390X_H__
+
+#include "l3fwd_s390x.h"
+
+typedef unsigned char vector_unsigned_char
+	__attribute__((vector_size(16*sizeof(unsigned char))));
+
+/*
+ * Read packet_type and destination IPV4 addresses from 4 mbufs.
+ */
+static inline void
+processx4_step1(struct rte_mbuf *pkt[FWDSTEP],
+		vector_unsigned_int *dip,
+		uint32_t *ipv4_flag)
+{
+	struct ipv4_hdr *ipv4_hdr;
+	struct ether_hdr *eth_hdr;
+	uint32_t x0, x1, x2, x3;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[0], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	x0 = ipv4_hdr->dst_addr;
+	ipv4_flag[0] = pkt[0]->packet_type & RTE_PTYPE_L3_IPV4;
+
+	rte_compiler_barrier();
+	eth_hdr = rte_pktmbuf_mtod(pkt[1], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	x1 = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[1]->packet_type;
+
+	rte_compiler_barrier();
+	eth_hdr = rte_pktmbuf_mtod(pkt[2], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	x2 = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[2]->packet_type;
+
+	rte_compiler_barrier();
+	eth_hdr = rte_pktmbuf_mtod(pkt[3], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	x3 = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[3]->packet_type;
+
+	rte_compiler_barrier();
+	dip[0] = (vector_unsigned_int){x0, x1, x2, x3};
+}
+
+/*
+ * Lookup into LPM for destination port.
+ * If lookup fails, use incoming port (portid) as destination port.
+ */
+static inline void
+processx4_step2(const struct lcore_conf *qconf,
+		vector_unsigned_int dip,
+		uint32_t ipv4_flag,
+		uint8_t portid,
+		struct rte_mbuf *pkt[FWDSTEP],
+		uint16_t dprt[FWDSTEP])
+{
+	rte_xmm_t dst;
+	const vector_unsigned_char bswap_mask = (vector_unsigned_char){
+							3, 2, 1, 0,
+							7, 6, 5, 4,
+							11, 10, 9, 8,
+							15, 14, 13, 12};
+
+	/* Byte swap 4 IPV4 addresses. */
+	dip = (vector_unsigned_int)vec_perm(*(vector_unsigned_char *)&dip,
+					(vector_unsigned_char){}, bswap_mask);
+
+	/* if all 4 packets are IPV4. */
+	if (likely(ipv4_flag)) {
+		rte_lpm_lookupx4(qconf->ipv4_lookup_struct, (xmm_t)dip,
+			(uint32_t *)&dst, portid);
+		/* get rid of unused upper 16 bit for each dport. */
+		dst.x = (xmm_t)vec_packs(dst.x, dst.x);
+		*(uint64_t *)dprt = dst.u64[0];
+	} else {
+		dst.x = (xmm_t)dip;
+		dprt[0] = lpm_get_dst_port_with_ipv4(qconf, pkt[0],
+							dst.u32[0], portid);
+		dprt[1] = lpm_get_dst_port_with_ipv4(qconf, pkt[1],
+							dst.u32[1], portid);
+		dprt[2] = lpm_get_dst_port_with_ipv4(qconf, pkt[2],
+							dst.u32[2], portid);
+		dprt[3] = lpm_get_dst_port_with_ipv4(qconf, pkt[3],
+							dst.u32[3], portid);
+	}
+}
+
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			uint8_t portid, struct lcore_conf *qconf)
+{
+	int32_t j;
+	uint16_t dst_port[MAX_PKT_BURST];
+	vector_unsigned_int dip[MAX_PKT_BURST / FWDSTEP];
+	uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
+	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+
+	for (j = 0; j != k; j += FWDSTEP)
+		processx4_step1(&pkts_burst[j], &dip[j / FWDSTEP],
+				&ipv4_flag[j / FWDSTEP]);
+
+	for (j = 0; j != k; j += FWDSTEP)
+		processx4_step2(qconf, dip[j / FWDSTEP],
+				ipv4_flag[j / FWDSTEP],
+				portid, &pkts_burst[j], &dst_port[j]);
+
+	/* Classify last up to 3 packets one by one */
+	switch (nb_rx % FWDSTEP) {
+	case 3:
+		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		j++;
+		/* fall-through */
+	case 2:
+		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		j++;
+		/* fall-through */
+	case 1:
+		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		j++;
+		/* fall-through */
+	}
+
+	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
+}
+
+#endif /* __L3FWD_LPM_S390X_H__ */
diff --git a/examples/l3fwd/l3fwd_s390x.h b/examples/l3fwd/l3fwd_s390x.h
new file mode 100644
index 0000000000..84baf6ce52
--- /dev/null
+++ b/examples/l3fwd/l3fwd_s390x.h
@@ -0,0 +1,261 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016 Intel Corporation.
+ * (c) Copyright IBM Corp. 2017, 2019
+ */
+#ifndef _L3FWD_S390X_H_
+#define _L3FWD_S390X_H_
+
+#include "l3fwd.h"
+#include "l3fwd_common.h"
+
+/* Vector Shift Right by Octet */
+#define vec_sro(a, b) vec_srb(a, (b) << 64)
+
+typedef unsigned int vector_unsigned_int
+	__attribute__((vector_size(4*sizeof(unsigned int))));
+typedef unsigned short vector_unsigned_short
+	__attribute__((vector_size(8*sizeof(unsigned short))));
+
+/*
+ * Update source and destination MAC addresses in the ethernet header.
+ * Perform RFC1812 checks and updates for IPV4 packets.
+ */
+static inline void
+processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t dst_port[FWDSTEP])
+{
+	vector_unsigned_int te[FWDSTEP];
+	vector_unsigned_int ve[FWDSTEP];
+	vector_unsigned_int *p[FWDSTEP];
+
+	p[0] = rte_pktmbuf_mtod(pkt[0], vector_unsigned_int *);
+	p[1] = rte_pktmbuf_mtod(pkt[1], vector_unsigned_int *);
+	p[2] = rte_pktmbuf_mtod(pkt[2], vector_unsigned_int *);
+	p[3] = rte_pktmbuf_mtod(pkt[3], vector_unsigned_int *);
+
+	ve[0] = (vector_unsigned_int)val_eth[dst_port[0]];
+	te[0] = *p[0];
+
+	ve[1] = (vector_unsigned_int)val_eth[dst_port[1]];
+	te[1] = *p[1];
+
+	ve[2] = (vector_unsigned_int)val_eth[dst_port[2]];
+	te[2] = *p[2];
+
+	ve[3] = (vector_unsigned_int)val_eth[dst_port[3]];
+	te[3] = *p[3];
+
+	/* Update first 12 bytes, keep rest bytes intact. */
+	te[0] = (vector_unsigned_int)vec_sel(
+			(vector_unsigned_short)ve[0],
+			(vector_unsigned_short)te[0],
+			(vector_unsigned_short) {0, 0, 0, 0,
+						0, 0, 0xffff, 0xffff});
+
+	te[1] = (vector_unsigned_int)vec_sel(
+			(vector_unsigned_short)ve[1],
+			(vector_unsigned_short)te[1],
+			(vector_unsigned_short) {0, 0, 0, 0,
+						0, 0, 0xffff, 0xffff});
+
+	te[2] = (vector_unsigned_int)vec_sel(
+			(vector_unsigned_short)ve[2],
+			(vector_unsigned_short)te[2],
+			(vector_unsigned_short) {0, 0, 0, 0, 0,
+						0, 0xffff, 0xffff});
+
+	te[3] = (vector_unsigned_int)vec_sel(
+			(vector_unsigned_short)ve[3],
+			(vector_unsigned_short)te[3],
+			(vector_unsigned_short) {0, 0, 0, 0,
+						0, 0, 0xffff, 0xffff});
+
+	*p[0] = te[0];
+	*p[1] = te[1];
+	*p[2] = te[2];
+	*p[3] = te[3];
+
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[0] + 1),
+		&dst_port[0], pkt[0]->packet_type);
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[1] + 1),
+		&dst_port[1], pkt[1]->packet_type);
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[2] + 1),
+		&dst_port[2], pkt[2]->packet_type);
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[3] + 1),
+		&dst_port[3], pkt[3]->packet_type);
+}
+
+/*
+ * Group consecutive packets with the same destination port in bursts of 4.
+ * Suppose we have array of destination ports:
+ * dst_port[] = {a, b, c, d,, e, ... }
+ * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
+ * We doing 4 comparisons at once and the result is 4 bit mask.
+ * This mask is used as an index into prebuild array of pnum values.
+ */
+static inline uint16_t *
+port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, vector_unsigned_short dp1,
+	vector_unsigned_short dp2)
+{
+	union {
+		uint16_t u16[FWDSTEP + 1];
+		uint64_t u64;
+	} *pnum = (void *)pn;
+
+	int32_t v;
+
+	v = vec_any_eq(dp1, dp2);
+
+
+	/* update last port counter. */
+	lp[0] += gptbl[v].lpv;
+
+	/* if dest port value has changed. */
+	if (v != GRPMSK) {
+		pnum->u64 = gptbl[v].pnum;
+		pnum->u16[FWDSTEP] = 1;
+		lp = pnum->u16 + gptbl[v].idx;
+	}
+
+	return lp;
+}
+
+/**
+ * Process one packet:
+ * Update source and destination MAC addresses in the ethernet header.
+ * Perform RFC1812 checks and updates for IPV4 packets.
+ */
+static inline void
+process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)
+{
+	struct ether_hdr *eth_hdr;
+	vector_unsigned_int te, ve;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+
+	te = *(vector_unsigned_int *)eth_hdr;
+	ve = (vector_unsigned_int)val_eth[dst_port[0]];
+
+	rfc1812_process((struct ipv4_hdr *)(eth_hdr + 1), dst_port,
+			pkt->packet_type);
+
+	/* dynamically vec_sel te and ve for MASK_ETH (0x3f) */
+	te = (vector_unsigned_int)vec_sel(
+		(vector_unsigned_short)ve,
+		(vector_unsigned_short)te,
+		(vector_unsigned_short){0, 0, 0, 0,
+					0, 0, 0xffff, 0xffff});
+
+	*(vector_unsigned_int *)eth_hdr = te;
+}
+
+/**
+ * Send packets burst from pkts_burst to the ports in dst_port array
+ */
+static __rte_always_inline void
+send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
+		uint16_t dst_port[MAX_PKT_BURST], int nb_rx)
+{
+	int32_t k;
+	int j = 0;
+	uint16_t dlp;
+	uint16_t *lp;
+	uint16_t pnum[MAX_PKT_BURST + 1];
+
+	/*
+	 * Finish packet processing and group consecutive
+	 * packets with the same destination port.
+	 */
+	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+	if (k != 0) {
+		vector_unsigned_short dp1, dp2;
+
+		lp = pnum;
+		lp[0] = 1;
+
+		processx4_step3(pkts_burst, dst_port);
+
+		/* dp1: <d[0], d[1], d[2], d[3], ... > */
+		dp1 = *(vector_unsigned_short *)dst_port;
+
+		for (j = FWDSTEP; j != k; j += FWDSTEP) {
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
+
+			/*
+			 * dp2:
+			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
+			 */
+			dp2 = *((vector_unsigned_short *)
+					&dst_port[j - FWDSTEP + 1]);
+			lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
+
+			/*
+			 * dp1:
+			 * <d[j], d[j+1], d[j+2], d[j+3], ... >
+			 */
+			dp1 = vec_sro(dp2, (vector unsigned char) {
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, (FWDSTEP - 1) * sizeof(dst_port[0])});
+		}
+
+		/*
+		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
+		 */
+		dp2 = vec_perm(dp1, (vector_unsigned_short){},
+				(vector unsigned char) {0xf9});
+		lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
+
+		/*
+		 * remove values added by the last repeated
+		 * dst port.
+		 */
+		lp[0]--;
+		dlp = dst_port[j - 1];
+	} else {
+		/* set dlp and lp to the never used values. */
+		dlp = BAD_PORT - 1;
+		lp = pnum + MAX_PKT_BURST;
+	}
+
+	/* Process up to last 3 packets one by one. */
+	switch (nb_rx % FWDSTEP) {
+	case 3:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+		/* fall-through */
+	case 2:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+		/* fall-through */
+	case 1:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+	}
+
+	/*
+	 * Send packets out, through destination port.
+	 * Consecutive packets with the same destination port
+	 * are already grouped together.
+	 * If destination port for the packet equals BAD_PORT,
+	 * then free the packet without sending it out.
+	 */
+	for (j = 0; j < nb_rx; j += k) {
+
+		int32_t m;
+		uint16_t pn;
+
+		pn = dst_port[j];
+		k = pnum[j];
+
+		if (likely(pn != BAD_PORT))
+			send_packetsx4(qconf, pn, pkts_burst + j, k);
+		else
+			for (m = j; m != j + k; m++)
+				rte_pktmbuf_free(pkts_burst[m]);
+
+	}
+}
+
+#endif /* _L3FWD_S390X_H_ */
diff --git a/lib/acl/acl_bld.c b/lib/acl/acl_bld.c
index 2816632803..7a07fca0b9 100644
--- a/lib/acl/acl_bld.c
+++ b/lib/acl/acl_bld.c
@@ -780,6 +780,9 @@  acl_build_reset(struct rte_acl_ctx *ctx)
 		sizeof(*ctx) - offsetof(struct rte_acl_ctx, num_categories));
 }
 
+
+
+
 static void
 acl_gen_full_range(struct acl_build_context *context, struct rte_acl_node *root,
 	struct rte_acl_node *end, int size, int level)
diff --git a/lib/acl/acl_gen.c b/lib/acl/acl_gen.c
index 25e75fec35..805747a17b 100644
--- a/lib/acl/acl_gen.c
+++ b/lib/acl/acl_gen.c
@@ -360,7 +360,16 @@  acl_gen_node(struct rte_acl_node *node, uint64_t *node_array,
 		array_ptr = &node_array[index->quad_index];
 		acl_add_ptrs(node, array_ptr, no_match, 0);
 		qtrp = (uint32_t *)node->transitions;
+
+		/* Swap qtrp on big endian that transitions[0]
+		 * is at least signifcant byte.
+		 */
+#if __BYTE_ORDER == __ORDER_BIG_ENDIAN__
+		node->node_index = __bswap_32(qtrp[0]);
+#else
 		node->node_index = qtrp[0];
+#endif
+
 		node->node_index <<= sizeof(index->quad_index) * CHAR_BIT;
 		node->node_index |= index->quad_index | node->node_type;
 		index->quad_index += node->fanout;
diff --git a/lib/acl/acl_run_scalar.c b/lib/acl/acl_run_scalar.c
index 3d61e79409..9f01ef8d8c 100644
--- a/lib/acl/acl_run_scalar.c
+++ b/lib/acl/acl_run_scalar.c
@@ -141,6 +141,14 @@  rte_acl_classify_scalar(const struct rte_acl_ctx *ctx, const uint8_t **data,
 		input0 = GET_NEXT_4BYTES(parms, 0);
 		input1 = GET_NEXT_4BYTES(parms, 1);
 
+		/* input needs to be swapped because the rules get
+		 * swapped while building the trie.
+		 */
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+		input0 = __bswap_32(input0);
+		input1 = __bswap_32(input1);
+#endif
+
 		for (n = 0; n < 4; n++) {
 
 			transition0 = scalar_transition(flows.trans,
diff --git a/lib/acl/rte_acl.c b/lib/acl/rte_acl.c
index a61c3ba188..8ce8cfbec5 100644
--- a/lib/acl/rte_acl.c
+++ b/lib/acl/rte_acl.c
@@ -101,6 +101,8 @@  static const rte_acl_classify_t classify_fns[] = {
 	[RTE_ACL_CLASSIFY_AVX2] = rte_acl_classify_avx2,
 	[RTE_ACL_CLASSIFY_NEON] = rte_acl_classify_neon,
 	[RTE_ACL_CLASSIFY_ALTIVEC] = rte_acl_classify_altivec,
+	/* use scalar for s390x for now */
+	[RTE_ACL_CLASSIFY_S390X] = rte_acl_classify_scalar,
 	[RTE_ACL_CLASSIFY_AVX512X16] = rte_acl_classify_avx512x16,
 	[RTE_ACL_CLASSIFY_AVX512X32] = rte_acl_classify_avx512x32,
 };
@@ -145,6 +147,27 @@  acl_check_alg_ppc(enum rte_acl_classify_alg alg)
 	return -EINVAL;
 }
 
+
+
+/*
+ * Helper function for acl_check_alg.
+ * Check support for PPC specific classify methods.
+ */
+static int
+acl_check_alg_s390x(enum rte_acl_classify_alg alg)
+{
+	if (alg == RTE_ACL_CLASSIFY_S390X) {
+#if defined(RTE_ARCH_S390X)
+		if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128)
+			return 0;
+#endif
+		return -ENOTSUP;
+	}
+
+	return -EINVAL;
+}
+
+
 #ifdef CC_AVX512_SUPPORT
 static int
 acl_check_avx512_cpu_flags(void)
@@ -216,6 +239,8 @@  acl_check_alg(enum rte_acl_classify_alg alg)
 		return acl_check_alg_arm(alg);
 	case RTE_ACL_CLASSIFY_ALTIVEC:
 		return acl_check_alg_ppc(alg);
+	case RTE_ACL_CLASSIFY_S390X:
+		return acl_check_alg_s390x(alg);
 	case RTE_ACL_CLASSIFY_AVX512X32:
 	case RTE_ACL_CLASSIFY_AVX512X16:
 	case RTE_ACL_CLASSIFY_AVX2:
@@ -244,6 +269,8 @@  acl_get_best_alg(void)
 		RTE_ACL_CLASSIFY_NEON,
 #elif defined(RTE_ARCH_PPC_64)
 		RTE_ACL_CLASSIFY_ALTIVEC,
+#elif defined(RTE_ARCH_S390X)
+		RTE_ACL_CLASSIFY_S390X,
 #elif defined(RTE_ARCH_X86)
 		RTE_ACL_CLASSIFY_AVX512X32,
 		RTE_ACL_CLASSIFY_AVX512X16,
diff --git a/lib/acl/rte_acl.h b/lib/acl/rte_acl.h
index ca75a6f220..630e057de7 100644
--- a/lib/acl/rte_acl.h
+++ b/lib/acl/rte_acl.h
@@ -242,8 +242,9 @@  enum rte_acl_classify_alg {
 	RTE_ACL_CLASSIFY_AVX2 = 3,    /**< requires AVX2 support. */
 	RTE_ACL_CLASSIFY_NEON = 4,    /**< requires NEON support. */
 	RTE_ACL_CLASSIFY_ALTIVEC = 5,    /**< requires ALTIVEC support. */
-	RTE_ACL_CLASSIFY_AVX512X16 = 6,  /**< requires AVX512 support. */
-	RTE_ACL_CLASSIFY_AVX512X32 = 7,  /**< requires AVX512 support. */
+	RTE_ACL_CLASSIFY_S390X = 6,    /**< requires s390x z13 support. */
+	RTE_ACL_CLASSIFY_AVX512X16 = 7,  /**< requires AVX512 support. */
+	RTE_ACL_CLASSIFY_AVX512X32 = 8,  /**< requires AVX512 support. */
 };
 
 /**
diff --git a/lib/eal/s390x/include/meson.build b/lib/eal/s390x/include/meson.build
new file mode 100644
index 0000000000..b4561d6a82
--- /dev/null
+++ b/lib/eal/s390x/include/meson.build
@@ -0,0 +1,16 @@ 
+# SPDX-License-Identifier: BSD-3-Clause
+# (c) Copyright IBM Corp. 2018, 2019
+
+install_headers(
+	'rte_atomic.h',
+	'rte_byteorder.h',
+	'rte_cpuflags.h',
+	'rte_cycles.h',
+	'rte_io.h',
+	'rte_memcpy.h',
+	'rte_pause.h',
+	'rte_prefetch.h',
+	'rte_rwlock.h',
+	'rte_spinlock.h',
+	'rte_vect.h',
+	subdir: get_option('include_subdir_arch'))
diff --git a/lib/eal/s390x/include/rte_atomic.h b/lib/eal/s390x/include/rte_atomic.h
new file mode 100644
index 0000000000..3031cefa4e
--- /dev/null
+++ b/lib/eal/s390x/include/rte_atomic.h
@@ -0,0 +1,44 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#ifndef _RTE_ATOMIC_S390X_H_
+#define _RTE_ATOMIC_S390X_H_
+
+#ifndef RTE_FORCE_INTRINSICS
+#  error Platform must be built with CONFIG_RTE_FORCE_INTRINSICS
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_atomic.h"
+
+#define rte_mb() rte_compiler_barrier()
+
+#define rte_wmb() rte_mb()
+
+#define rte_rmb() rte_mb()
+
+#define rte_smp_mb() rte_mb()
+
+#define rte_smp_wmb() rte_wmb()
+
+#define rte_smp_rmb() rte_rmb()
+
+#define rte_io_mb() rte_mb()
+
+#define rte_io_wmb() rte_wmb()
+
+#define rte_io_rmb() rte_rmb()
+
+#define rte_cio_wmb() rte_wmb()
+
+#define rte_cio_rmb() rte_rmb()
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_ATOMIC_S390X_H_ */
diff --git a/lib/eal/s390x/include/rte_byteorder.h b/lib/eal/s390x/include/rte_byteorder.h
new file mode 100644
index 0000000000..de6e410b4b
--- /dev/null
+++ b/lib/eal/s390x/include/rte_byteorder.h
@@ -0,0 +1,43 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+/* Inspired from FreeBSD src/sys/powerpc/include/endian.h
+ * Copyright (c) 1987, 1991, 1993
+ * The Regents of the University of California.  All rights reserved.
+ */
+
+#ifndef _RTE_BYTEORDER_S390X_H_
+#define _RTE_BYTEORDER_S390X_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include "generic/rte_byteorder.h"
+
+/* s390x is big endian
+ */
+
+#define rte_cpu_to_le_16(x) rte_bswap16(x)
+#define rte_cpu_to_le_32(x) rte_bswap32(x)
+#define rte_cpu_to_le_64(x) rte_bswap64(x)
+
+#define rte_cpu_to_be_16(x) (x)
+#define rte_cpu_to_be_32(x) (x)
+#define rte_cpu_to_be_64(x) (x)
+
+#define rte_le_to_cpu_16(x) rte_bswap16(x)
+#define rte_le_to_cpu_32(x) rte_bswap32(x)
+#define rte_le_to_cpu_64(x) rte_bswap64(x)
+
+#define rte_be_to_cpu_16(x) (x)
+#define rte_be_to_cpu_32(x) (x)
+#define rte_be_to_cpu_64(x) (x)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_BYTEORDER_S390X_H_ */
diff --git a/lib/eal/s390x/include/rte_cpuflags.h b/lib/eal/s390x/include/rte_cpuflags.h
new file mode 100644
index 0000000000..52136bea87
--- /dev/null
+++ b/lib/eal/s390x/include/rte_cpuflags.h
@@ -0,0 +1,41 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#ifndef _RTE_CPUFLAGS_S390X_H_
+#define _RTE_CPUFLAGS_S390X_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Enumeration of all CPU features supported
+ */
+enum rte_cpu_flag_t {
+	RTE_CPUFLAG_ESAN3 = 0,
+	RTE_CPUFLAG_ZARCH,
+	RTE_CPUFLAG_STFLE,
+	RTE_CPUFLAG_MSA,
+	RTE_CPUFLAG_LDISP,
+	RTE_CPUFLAG_EIMM,
+	RTE_CPUFLAG_DFP,
+	RTE_CPUFLAG_HPAGE,
+	RTE_CPUFLAG_ETF3EH,
+	RTE_CPUFLAG_HIGH_GPRS,
+	RTE_CPUFLAG_TE,
+	RTE_CPUFLAG_VXRS,
+	RTE_CPUFLAG_VXRS_BCD,
+	RTE_CPUFLAG_VXRS_EXT,
+	RTE_CPUFLAG_GS,
+	/* The last item */
+	RTE_CPUFLAG_NUMFLAGS,/**< This should always be the last! */
+};
+
+#include "generic/rte_cpuflags.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_CPUFLAGS_S390X_H_ */
diff --git a/lib/eal/s390x/include/rte_cycles.h b/lib/eal/s390x/include/rte_cycles.h
new file mode 100644
index 0000000000..7a430e06a8
--- /dev/null
+++ b/lib/eal/s390x/include/rte_cycles.h
@@ -0,0 +1,44 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#ifndef _RTE_CYCLES_S390X_H_
+#define _RTE_CYCLES_S390X_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_cycles.h"
+
+#include <rte_common.h>
+
+/**
+ * Read the time base register.
+ *
+ * @return
+ *   The time base for this lcore.
+ */
+static inline uint64_t
+rte_rdtsc(void)
+{
+	uint64_t tsc;
+	asm volatile("stckf %0" : "=Q"(tsc) : : "cc");
+	return tsc;
+}
+
+static inline uint64_t
+rte_rdtsc_precise(void)
+{
+	rte_mb();
+	return rte_rdtsc();
+}
+
+static inline uint64_t
+rte_get_tsc_cycles(void) { return rte_rdtsc(); }
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_CYCLES_S390X_H_ */
diff --git a/lib/eal/s390x/include/rte_io.h b/lib/eal/s390x/include/rte_io.h
new file mode 100644
index 0000000000..9d18e65cab
--- /dev/null
+++ b/lib/eal/s390x/include/rte_io.h
@@ -0,0 +1,184 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#ifndef _RTE_IO_S390X_H_
+#define _RTE_IO_S390X_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+#define RTE_OVERRIDE_IO_H
+
+#include "generic/rte_io.h"
+
+#include <unistd.h>
+#include <sys/syscall.h>
+
+union register_pair {
+	__int128_t pair;
+	struct {
+		unsigned long even;
+		unsigned long odd;
+	} even_odd;
+};
+
+/* s390 requires special instructions to access IO memory. */
+static inline uint64_t pcilgi(const volatile void *ioaddr, size_t len)
+{
+	int cc = -1;
+	uint64_t val;
+	union register_pair ioaddr_len = {
+		.even_odd.even = (uint64_t)ioaddr, .even_odd.odd = len
+	};
+	asm volatile (
+		"       .insn   rre,0xb9d60000,%[val],%[ioaddr_len]\n"
+		"       ipm     %[cc]\n"
+		"       srl     %[cc],28\n"
+		: [cc] "+d" (cc), [val] "=d" (val),
+		  [ioaddr_len] "+&d" (ioaddr_len.pair) :: "cc");
+	return val;
+}
+
+static inline void pcistgi(volatile void *ioaddr, uint64_t val, size_t len)
+{
+	int cc = -1;
+	union register_pair ioaddr_len = {
+		.even_odd.even = (uint64_t)ioaddr, .even_odd.odd = len
+	};
+	asm volatile (
+		"       .insn   rre,0xb9d40000,%[val],%[ioaddr_len]\n"
+		"       ipm     %[cc]\n"
+		"       srl     %[cc],28\n"
+		: [cc] "+d" (cc), [ioaddr_len] "+&d" (ioaddr_len.pair)
+		: [val] "d" (val)
+		: "cc", "memory");
+}
+
+/* fall back to syscall on old machines ? */
+static __rte_always_inline uint8_t
+rte_read8_relaxed(const volatile void *addr)
+{
+	return pcilgi(addr, 1);
+}
+
+static __rte_always_inline uint16_t
+rte_read16_relaxed(const volatile void *addr)
+{
+	return pcilgi(addr, 2);
+}
+
+static __rte_always_inline uint32_t
+rte_read32_relaxed(const volatile void *addr)
+{
+	return pcilgi(addr, 4);
+}
+
+static __rte_always_inline uint64_t
+rte_read64_relaxed(const volatile void *addr)
+{
+	return pcilgi(addr, 8);
+}
+
+static __rte_always_inline void
+rte_write8_relaxed(uint8_t value, volatile void *addr)
+{
+	pcistgi(addr, value, sizeof(value));
+}
+
+static __rte_always_inline void
+rte_write16_relaxed(uint16_t value, volatile void *addr)
+{
+	pcistgi(addr, value, sizeof(value));
+}
+
+static __rte_always_inline void
+rte_write32_relaxed(uint32_t value, volatile void *addr)
+{
+	pcistgi(addr, value, sizeof(value));
+}
+
+static __rte_always_inline void
+rte_write64_relaxed(uint64_t value, volatile void *addr)
+{
+	pcistgi(addr, value, sizeof(value));
+}
+
+static __rte_always_inline uint8_t
+rte_read8(const volatile void *addr)
+{
+	uint8_t val;
+	val = rte_read8_relaxed(addr);
+	rte_io_rmb();
+	return val;
+}
+
+static __rte_always_inline uint16_t
+rte_read16(const volatile void *addr)
+{
+	uint16_t val;
+	val = rte_read16_relaxed(addr);
+	rte_io_rmb();
+	return val;
+}
+
+static __rte_always_inline uint32_t
+rte_read32(const volatile void *addr)
+{
+	uint32_t val;
+	val = rte_read32_relaxed(addr);
+	rte_io_rmb();
+	return val;
+}
+
+static __rte_always_inline uint64_t
+rte_read64(const volatile void *addr)
+{
+	uint64_t val;
+	val = rte_read64_relaxed(addr);
+	rte_io_rmb();
+	return val;
+}
+
+static __rte_always_inline void
+rte_write8(uint8_t value, volatile void *addr)
+{
+	rte_io_wmb();
+	rte_write8_relaxed(value, addr);
+}
+
+static __rte_always_inline void
+rte_write16(uint16_t value, volatile void *addr)
+{
+	rte_io_wmb();
+	rte_write16_relaxed(value, addr);
+}
+
+static __rte_always_inline void
+rte_write32(uint32_t value, volatile void *addr)
+{
+	rte_io_wmb();
+	rte_write32_relaxed(value, addr);
+}
+
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	rte_write32(value, addr);
+}
+
+static __rte_always_inline void
+rte_write64(uint64_t value, volatile void *addr)
+{
+	rte_io_wmb();
+	rte_write64_relaxed(value, addr);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_IO_S390X_H_ */
diff --git a/lib/eal/s390x/include/rte_mcslock.h b/lib/eal/s390x/include/rte_mcslock.h
new file mode 100644
index 0000000000..9125237dfd
--- /dev/null
+++ b/lib/eal/s390x/include/rte_mcslock.h
@@ -0,0 +1,18 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Arm Limited
+ */
+
+#ifndef _RTE_MCSLOCK_S390X_H_
+#define _RTE_MCSLOCK_S390X_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_mcslock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MCSLOCK_S390X_H_ */
diff --git a/lib/eal/s390x/include/rte_memcpy.h b/lib/eal/s390x/include/rte_memcpy.h
new file mode 100644
index 0000000000..1135b1af6f
--- /dev/null
+++ b/lib/eal/s390x/include/rte_memcpy.h
@@ -0,0 +1,55 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#ifndef _RTE_MEMCPY_S390X_H_
+#define _RTE_MEMCPY_S390X_H_
+
+#include <stdint.h>
+#include <string.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_memcpy.h"
+
+
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 16);
+}
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 32);
+}
+static inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 48);
+}
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 64);
+}
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 128);
+}
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 256);
+}
+#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MEMCPY_S390X_H_ */
diff --git a/lib/eal/s390x/include/rte_pause.h b/lib/eal/s390x/include/rte_pause.h
new file mode 100644
index 0000000000..be90ce6a1f
--- /dev/null
+++ b/lib/eal/s390x/include/rte_pause.h
@@ -0,0 +1,22 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#ifndef _RTE_PAUSE_S390X_H_
+#define _RTE_PAUSE_S390X_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_pause.h"
+
+static inline void rte_pause(void)
+{
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PAUSE_S390X_H_ */
diff --git a/lib/eal/s390x/include/rte_power_intrinsics.h b/lib/eal/s390x/include/rte_power_intrinsics.h
new file mode 100644
index 0000000000..c0e9ac279f
--- /dev/null
+++ b/lib/eal/s390x/include/rte_power_intrinsics.h
@@ -0,0 +1,20 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#ifndef _RTE_POWER_INTRINSIC_PPC_H_
+#define _RTE_POWER_INTRINSIC_PPC_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_common.h>
+
+#include "generic/rte_power_intrinsics.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_POWER_INTRINSIC_PPC_H_ */
diff --git a/lib/eal/s390x/include/rte_prefetch.h b/lib/eal/s390x/include/rte_prefetch.h
new file mode 100644
index 0000000000..968b01cb70
--- /dev/null
+++ b/lib/eal/s390x/include/rte_prefetch.h
@@ -0,0 +1,46 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#ifndef _RTE_PREFETCH_S390X_H_
+#define _RTE_PREFETCH_S390X_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_common.h>
+#include "generic/rte_prefetch.h"
+
+static inline void rte_prefetch0(const volatile void *p)
+{
+	asm volatile ("pfd 1, 0(%[p])" : : [p] "r" (p));
+}
+
+static inline void rte_prefetch1(const volatile void *p)
+{
+	asm volatile ("pfd 1, 0(%[p])" : : [p] "r" (p));
+}
+
+static inline void rte_prefetch2(const volatile void *p)
+{
+	asm volatile ("pfd 1, 0(%[p])" : : [p] "r" (p));
+}
+
+static inline void rte_prefetch_non_temporal(const volatile void *p)
+{
+	/* non-temporal version not available, fallback to rte_prefetch0 */
+	rte_prefetch0(p);
+}
+
+__rte_experimental
+static inline void rte_cldemote(const volatile void *p)
+{
+	RTE_SET_USED(p);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PREFETCH_S390X_H_ */
diff --git a/lib/eal/s390x/include/rte_rwlock.h b/lib/eal/s390x/include/rte_rwlock.h
new file mode 100644
index 0000000000..f649484f35
--- /dev/null
+++ b/lib/eal/s390x/include/rte_rwlock.h
@@ -0,0 +1,42 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#ifndef _RTE_RWLOCK_S390X_H_
+#define _RTE_RWLOCK_S390X_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_rwlock.h"
+
+static inline void
+rte_rwlock_read_lock_tm(rte_rwlock_t *rwl)
+{
+	rte_rwlock_read_lock(rwl);
+}
+
+static inline void
+rte_rwlock_read_unlock_tm(rte_rwlock_t *rwl)
+{
+	rte_rwlock_read_unlock(rwl);
+}
+
+static inline void
+rte_rwlock_write_lock_tm(rte_rwlock_t *rwl)
+{
+	rte_rwlock_write_lock(rwl);
+}
+
+static inline void
+rte_rwlock_write_unlock_tm(rte_rwlock_t *rwl)
+{
+	rte_rwlock_write_unlock(rwl);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_RWLOCK_S390X_H_ */
diff --git a/lib/eal/s390x/include/rte_spinlock.h b/lib/eal/s390x/include/rte_spinlock.h
new file mode 100644
index 0000000000..0434864fbc
--- /dev/null
+++ b/lib/eal/s390x/include/rte_spinlock.h
@@ -0,0 +1,85 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#ifndef _RTE_SPINLOCK_S390X_H_
+#define _RTE_SPINLOCK_S390X_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_common.h>
+#include "generic/rte_spinlock.h"
+
+#ifndef RTE_FORCE_INTRINSICS
+
+static inline void
+rte_spinlock_lock(rte_spinlock_t *sl)
+{
+	while (__sync_lock_test_and_set(&sl->locked, 1))
+		while (sl->locked)
+			rte_pause();
+}
+
+static inline void
+rte_spinlock_unlock(rte_spinlock_t *sl)
+{
+	__sync_lock_release(&sl->locked);
+}
+
+static inline int
+rte_spinlock_trylock(rte_spinlock_t *sl)
+{
+	return __sync_lock_test_and_set(&sl->locked, 1) == 0;
+}
+
+#endif
+
+
+static inline int rte_tm_supported(void)
+{
+	return 0;
+}
+
+static inline void
+rte_spinlock_lock_tm(rte_spinlock_t *sl)
+{
+	rte_spinlock_lock(sl); /* fall-back */
+}
+
+static inline int
+rte_spinlock_trylock_tm(rte_spinlock_t *sl)
+{
+	return rte_spinlock_trylock(sl);
+}
+
+static inline void
+rte_spinlock_unlock_tm(rte_spinlock_t *sl)
+{
+	rte_spinlock_unlock(sl);
+}
+
+static inline void
+rte_spinlock_recursive_lock_tm(rte_spinlock_recursive_t *slr)
+{
+	rte_spinlock_recursive_lock(slr); /* fall-back */
+}
+
+static inline void
+rte_spinlock_recursive_unlock_tm(rte_spinlock_recursive_t *slr)
+{
+	rte_spinlock_recursive_unlock(slr);
+}
+
+static inline int
+rte_spinlock_recursive_trylock_tm(rte_spinlock_recursive_t *slr)
+{
+	return rte_spinlock_recursive_trylock(slr);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_SPINLOCK_S390X_H_ */
diff --git a/lib/eal/s390x/include/rte_ticketlock.h b/lib/eal/s390x/include/rte_ticketlock.h
new file mode 100644
index 0000000000..0785363c94
--- /dev/null
+++ b/lib/eal/s390x/include/rte_ticketlock.h
@@ -0,0 +1,18 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2019
+ */
+
+#ifndef _RTE_TICKETLOCK_S390X_H_
+#define _RTE_TICKETLOCK_S390X_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_ticketlock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_TICKETLOCK_S390X_H_ */
diff --git a/lib/eal/s390x/include/rte_vect.h b/lib/eal/s390x/include/rte_vect.h
new file mode 100644
index 0000000000..8f930755be
--- /dev/null
+++ b/lib/eal/s390x/include/rte_vect.h
@@ -0,0 +1,35 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#ifndef _RTE_VECT_S390X_H_
+#define _RTE_VECT_S390X_H_
+
+#include <vecintrin.h>
+#include "generic/rte_vect.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RTE_VECT_DEFAULT_SIMD_BITWIDTH RTE_VECT_SIMD_256
+
+typedef int xmm_t __attribute__((vector_size(4*sizeof(int))));
+
+#define	XMM_SIZE	(sizeof(xmm_t))
+#define	XMM_MASK	(XMM_SIZE - 1)
+
+typedef union rte_xmm {
+	xmm_t    x;
+	uint8_t  u8[XMM_SIZE / sizeof(uint8_t)];
+	uint16_t u16[XMM_SIZE / sizeof(uint16_t)];
+	uint32_t u32[XMM_SIZE / sizeof(uint32_t)];
+	uint64_t u64[XMM_SIZE / sizeof(uint64_t)];
+	double   pd[XMM_SIZE / sizeof(double)];
+} __aligned(16) rte_xmm_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_VECT_S390X_H_ */
diff --git a/lib/eal/s390x/meson.build b/lib/eal/s390x/meson.build
new file mode 100644
index 0000000000..c8cc8d1f3d
--- /dev/null
+++ b/lib/eal/s390x/meson.build
@@ -0,0 +1,16 @@ 
+# SPDX-License-Identifier: BSD-3-Clause
+# (c) Copyright IBM Corp. 2018, 2019
+
+subdir('include')
+
+# 19.xx zarch patches lib/librte_eal/common/arch/s390x/meson.build:
+# var was: eal_common_arch_sources
+#
+sources += files(
+        'rte_cpuflags.c',
+        'rte_cycles.c',
+        'rte_hypervisor.c',
+        'rte_power_intrinsics.c',
+)
+
+
diff --git a/lib/eal/s390x/rte_cpuflags.c b/lib/eal/s390x/rte_cpuflags.c
new file mode 100644
index 0000000000..d57a51d267
--- /dev/null
+++ b/lib/eal/s390x/rte_cpuflags.c
@@ -0,0 +1,91 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#include "rte_cpuflags.h"
+
+#include <elf.h>
+#include <fcntl.h>
+#include <assert.h>
+#include <unistd.h>
+#include <string.h>
+
+/* Symbolic values for the entries in the auxiliary table */
+#define AT_HWCAP  16
+#define AT_HWCAP2 26
+
+/* software based registers */
+enum cpu_register_t {
+	REG_NONE = 0,
+	REG_HWCAP,
+	REG_HWCAP2,
+	REG_MAX
+};
+
+typedef uint32_t hwcap_registers_t[REG_MAX];
+
+struct feature_entry {
+	uint32_t reg;
+	uint32_t bit;
+#define CPU_FLAG_NAME_MAX_LEN 64
+	char name[CPU_FLAG_NAME_MAX_LEN];
+};
+
+#define FEAT_DEF(name, reg, bit) \
+	[RTE_CPUFLAG_##name] = {reg, bit, #name},
+
+const struct feature_entry rte_cpu_feature_table[] = {
+	FEAT_DEF(ESAN3,                  REG_HWCAP,   0)
+	FEAT_DEF(ZARCH,                  REG_HWCAP,   1)
+	FEAT_DEF(STFLE,                  REG_HWCAP,   2)
+	FEAT_DEF(MSA,                    REG_HWCAP,   3)
+	FEAT_DEF(LDISP,                  REG_HWCAP,   4)
+	FEAT_DEF(EIMM,                   REG_HWCAP,   5)
+	FEAT_DEF(DFP,                    REG_HWCAP,   6)
+	FEAT_DEF(HPAGE,                  REG_HWCAP,   7)
+	FEAT_DEF(ETF3EH,                 REG_HWCAP,   8)
+	FEAT_DEF(HIGH_GPRS,              REG_HWCAP,   9)
+	FEAT_DEF(TE,                     REG_HWCAP,  10)
+	FEAT_DEF(VXRS,                   REG_HWCAP,  11)
+	FEAT_DEF(VXRS_BCD,               REG_HWCAP,  12)
+	FEAT_DEF(VXRS_EXT,               REG_HWCAP,  13)
+	FEAT_DEF(GS,                     REG_HWCAP,  14)
+};
+
+/*
+ * Read AUXV software register and get cpu features for Power
+ */
+static void
+rte_cpu_get_features(hwcap_registers_t out)
+{
+	out[REG_HWCAP] = rte_cpu_getauxval(AT_HWCAP);
+	out[REG_HWCAP2] = rte_cpu_getauxval(AT_HWCAP2);
+}
+
+/*
+ * Checks if a particular flag is available on current machine.
+ */
+int
+rte_cpu_get_flag_enabled(enum rte_cpu_flag_t feature)
+{
+	const struct feature_entry *feat;
+	hwcap_registers_t regs = {0};
+
+	if (feature >= RTE_CPUFLAG_NUMFLAGS)
+		return -ENOENT;
+
+	feat = &rte_cpu_feature_table[feature];
+	if (feat->reg == REG_NONE)
+		return -EFAULT;
+
+	rte_cpu_get_features(regs);
+	return (regs[feat->reg] >> feat->bit) & 1;
+}
+
+const char *
+rte_cpu_get_flag_name(enum rte_cpu_flag_t feature)
+{
+	if (feature >= RTE_CPUFLAG_NUMFLAGS)
+		return NULL;
+	return rte_cpu_feature_table[feature].name;
+}
diff --git a/lib/eal/s390x/rte_cycles.c b/lib/eal/s390x/rte_cycles.c
new file mode 100644
index 0000000000..b29c4454a1
--- /dev/null
+++ b/lib/eal/s390x/rte_cycles.c
@@ -0,0 +1,11 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#include "eal_private.h"
+
+uint64_t
+get_tsc_freq_arch(void)
+{
+	return 0;
+}
diff --git a/lib/eal/s390x/rte_hypervisor.c b/lib/eal/s390x/rte_hypervisor.c
new file mode 100644
index 0000000000..22b0c5cc47
--- /dev/null
+++ b/lib/eal/s390x/rte_hypervisor.c
@@ -0,0 +1,11 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#include "rte_hypervisor.h"
+
+enum rte_hypervisor
+rte_hypervisor_get(void)
+{
+	return RTE_HYPERVISOR_UNKNOWN;
+}
diff --git a/lib/eal/s390x/rte_power_intrinsics.c b/lib/eal/s390x/rte_power_intrinsics.c
new file mode 100644
index 0000000000..f00b58ade5
--- /dev/null
+++ b/lib/eal/s390x/rte_power_intrinsics.c
@@ -0,0 +1,51 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#include "rte_power_intrinsics.h"
+
+/**
+ * This function is not supported on PPC64.
+ */
+int
+rte_power_monitor(const struct rte_power_monitor_cond *pmc,
+		const uint64_t tsc_timestamp)
+{
+	RTE_SET_USED(pmc);
+	RTE_SET_USED(tsc_timestamp);
+
+	return -ENOTSUP;
+}
+
+/**
+ * This function is not supported on PPC64.
+ */
+int
+rte_power_pause(const uint64_t tsc_timestamp)
+{
+	RTE_SET_USED(tsc_timestamp);
+
+	return -ENOTSUP;
+}
+
+/**
+ * This function is not supported on PPC64.
+ */
+int
+rte_power_monitor_wakeup(const unsigned int lcore_id)
+{
+	RTE_SET_USED(lcore_id);
+
+	return -ENOTSUP;
+}
+
+int
+rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
+		const uint32_t num, const uint64_t tsc_timestamp)
+{
+	RTE_SET_USED(pmc);
+	RTE_SET_USED(num);
+	RTE_SET_USED(tsc_timestamp);
+
+	return -ENOTSUP;
+}
diff --git a/lib/hash/rte_fbk_hash.h b/lib/hash/rte_fbk_hash.h
index b01126999b..956d3f90f9 100644
--- a/lib/hash/rte_fbk_hash.h
+++ b/lib/hash/rte_fbk_hash.h
@@ -123,9 +123,16 @@  rte_fbk_hash_add_key_with_bucket(struct rte_fbk_hash_table *ht,
 	 * corrupted due to race conditions, but it's still possible to
 	 * overwrite entries that have just been made valid.
 	 */
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
 	const uint64_t new_entry = ((uint64_t)(key) << 32) |
 			((uint64_t)(value) << 16) |
 			1;  /* 1 = is_entry bit. */
+	#else
+	const uint64_t new_entry =
+			((uint64_t)(1) << 48) | /* 1 = is_entry bit. */
+			((uint64_t)(value) << 32) |
+			(uint64_t)(key);
+	#endif
 	uint32_t i;
 
 	for (i = 0; i < ht->entries_per_bucket; i++) {
diff --git a/lib/lpm/meson.build b/lib/lpm/meson.build
index 4cd48886fc..47400079bb 100644
--- a/lib/lpm/meson.build
+++ b/lib/lpm/meson.build
@@ -7,6 +7,7 @@  headers = files('rte_lpm.h', 'rte_lpm6.h')
 # without worrying about which architecture we actually need
 indirect_headers += files(
         'rte_lpm_altivec.h',
+        'rte_lpm_s390x.h',
         'rte_lpm_neon.h',
         'rte_lpm_scalar.h',
         'rte_lpm_sse.h',
diff --git a/lib/lpm/rte_lpm.h b/lib/lpm/rte_lpm.h
index 75e27ff164..9991842d5f 100644
--- a/lib/lpm/rte_lpm.h
+++ b/lib/lpm/rte_lpm.h
@@ -407,6 +407,8 @@  rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
 #include "rte_lpm_neon.h"
 #elif defined(RTE_ARCH_PPC_64)
 #include "rte_lpm_altivec.h"
+#elif defined(RTE_ARCH_S390X)
+#include "rte_lpm_s390x.h"
 #elif defined(RTE_ARCH_X86)
 #include "rte_lpm_sse.h"
 #else
diff --git a/lib/lpm/rte_lpm6.c b/lib/lpm/rte_lpm6.c
index 8d21aeddb8..4a0f5740a2 100644
--- a/lib/lpm/rte_lpm6.c
+++ b/lib/lpm/rte_lpm6.c
@@ -18,6 +18,7 @@ 
 #include <assert.h>
 #include <rte_jhash.h>
 #include <rte_tailq.h>
+#include <rte_byteorder.h>
 
 #include "rte_lpm6.h"
 
@@ -52,6 +53,8 @@  static struct rte_tailq_elem rte_lpm6_tailq = {
 };
 EAL_REGISTER_TAILQ(rte_lpm6_tailq)
 
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+
 /** Tbl entry structure. It is the same for both tbl24 and tbl8 */
 struct rte_lpm6_tbl_entry {
 	uint32_t next_hop:	21;  /**< Next hop / next table to be checked. */
@@ -63,6 +66,21 @@  struct rte_lpm6_tbl_entry {
 	uint32_t ext_entry :1;   /**< External entry. */
 };
 
+#else
+
+struct rte_lpm6_tbl_entry {
+
+	/* Flags. */
+	uint32_t ext_entry :1;   /**< External entry. */
+	uint32_t valid_group :1; /**< Group validation flag. */
+	uint32_t valid     :1;   /**< Validation flag. */
+
+	uint32_t depth	:8;      /**< Rule depth. */
+	uint32_t next_hop:	21;  /**< Next hop / next table to be checked. */
+};
+
+#endif
+
 /** Rules tbl entry structure. */
 struct rte_lpm6_rule {
 	uint8_t ip[RTE_LPM6_IPV6_ADDR_SIZE]; /**< Rule IP address. */
diff --git a/lib/lpm/rte_lpm_s390x.h b/lib/lpm/rte_lpm_s390x.h
new file mode 100644
index 0000000000..eb1fdd4509
--- /dev/null
+++ b/lib/lpm/rte_lpm_s390x.h
@@ -0,0 +1,130 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2016, 2018
+ */
+
+#ifndef _RTE_LPM_S390X_H_
+#define _RTE_LPM_S390X_H_
+
+#include <rte_branch_prediction.h>
+#include <rte_byteorder.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static inline void
+rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
+	uint32_t defv)
+{
+	typedef int vector_signed_int
+		__attribute__((vector_size(4*sizeof(int))));
+	vector_signed_int i24;
+	rte_xmm_t i8;
+	uint32_t tbl[4];
+	uint64_t idx, pt, pt2;
+	const uint32_t *ptbl;
+
+	const uint32_t mask = UINT8_MAX;
+	const vector_signed_int mask8 = (xmm_t){mask, mask, mask, mask};
+
+	/*
+	 * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries
+	 * as one 64-bit value (0x0300000003000000).
+	 */
+	const uint64_t mask_xv =
+		((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK |
+		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32);
+
+	/*
+	 * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries
+	 * as one 64-bit value (0x0100000001000000).
+	 */
+	const uint64_t mask_v =
+		((uint64_t)RTE_LPM_LOOKUP_SUCCESS |
+		(uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32);
+
+	/* get 4 indexes for tbl24[]. */
+	i24[0] = (uint32_t)ip[0] >> 8;
+	i24[1] = (uint32_t)ip[1] >> 8;
+	i24[2] = (uint32_t)ip[2] >> 8;
+	i24[3] = (uint32_t)ip[3] >> 8;
+
+	/* extract values from tbl24[] */
+	idx = (uint32_t)i24[0];
+	idx = idx < (1<<24) ? idx : (1<<24)-1;
+	ptbl = (const uint32_t *)&lpm->tbl24[idx];
+	tbl[0] = *ptbl;
+
+	idx = (uint32_t) i24[1];
+	idx = idx < (1<<24) ? idx : (1<<24)-1;
+	ptbl = (const uint32_t *)&lpm->tbl24[idx];
+	tbl[1] = *ptbl;
+
+	idx = (uint32_t) i24[2];
+	idx = idx < (1<<24) ? idx : (1<<24)-1;
+	ptbl = (const uint32_t *)&lpm->tbl24[idx];
+	tbl[2] = *ptbl;
+
+	idx = (uint32_t) i24[3];
+	idx = idx < (1<<24) ? idx : (1<<24)-1;
+	ptbl = (const uint32_t *)&lpm->tbl24[idx];
+	tbl[3] = *ptbl;
+
+	/* get 4 indexes for tbl8[]. */
+	i8.x = vec_and(ip, mask8);
+
+	pt = (uint64_t)tbl[0] |
+		(uint64_t)tbl[1] << 32;
+	pt2 = (uint64_t)tbl[2] |
+		(uint64_t)tbl[3] << 32;
+
+	/* search successfully finished for all 4 IP addresses. */
+	if (likely((pt & mask_xv) == mask_v) &&
+			likely((pt2 & mask_xv) == mask_v)) {
+		*(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES;
+		*(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES;
+		return;
+	}
+
+	if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+		i8.u32[0] = i8.u32[0] +
+			(uint8_t)tbl[0] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]];
+		tbl[0] = *ptbl;
+	}
+	if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+		i8.u32[1] = i8.u32[1] +
+			(uint8_t)tbl[1] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]];
+		tbl[1] = *ptbl;
+	}
+	if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+		i8.u32[2] = i8.u32[2] +
+			(uint8_t)tbl[2] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]];
+		tbl[2] = *ptbl;
+	}
+	if (unlikely((pt2 >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+		i8.u32[3] = i8.u32[3] +
+			(uint8_t)tbl[3] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]];
+		tbl[3] = *ptbl;
+	}
+
+	hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & 0x00FFFFFF : defv;
+	hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & 0x00FFFFFF : defv;
+	hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & 0x00FFFFFF : defv;
+	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF : defv;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_LPM_S390X_H_ */
diff --git a/meson.build b/meson.build
index 39cb73846d..7fe6d7bf75 100644
--- a/meson.build
+++ b/meson.build
@@ -58,6 +58,8 @@  elif host_machine.cpu_family().startswith('loongarch')
     arch_subdir = 'loongarch'
 elif host_machine.cpu_family().startswith('ppc')
     arch_subdir = 'ppc'
+elif host_machine.cpu_family().startswith('s390x')
+    arch_subdir = 's390x'
 elif host_machine.cpu_family().startswith('riscv')
     arch_subdir = 'riscv'
 endif