@@ -36,9 +36,6 @@ TAGS
# ignore python bytecode files
*.pyc
-# ignore BPF programs
-drivers/net/tap/bpf/tap_bpf_program.o
-
# DTS results
dts/output
deleted file mode 100644
@@ -1,19 +0,0 @@
-# SPDX-License-Identifier: BSD-3-Clause
-# This file is not built as part of normal DPDK build.
-# It is used to generate the eBPF code for TAP RSS.
-
-CLANG=clang
-CLANG_OPTS=-O2
-TARGET=../tap_bpf_insns.h
-
-all: $(TARGET)
-
-clean:
- rm tap_bpf_program.o $(TARGET)
-
-tap_bpf_program.o: tap_bpf_program.c
- $(CLANG) $(CLANG_OPTS) -emit-llvm -c $< -o - | \
- llc -march=bpf -filetype=obj -o $@
-
-$(TARGET): tap_bpf_program.o
- python3 bpf_extract.py -stap_bpf_program.c -o $@ $<
new file mode 100644
@@ -0,0 +1,12 @@
+This is the BPF program used to implement the RSS across queues
+flow action. It works like the skbedit tc filter but instead of mapping
+to only one queue, it maps to multiple queues based on RSS hash.
+
+This version is built with the BPF Compile Once — Run Everywhere (CO-RE)
+framework and uses libbpf and bpftool.
+
+Limitations
+- requires libbpf version XX or later
+- rebuilding the BPF requires clang and bpftool
+- only Toeplitz hash with standard 40 byte key is supported
+- the number of queues per RSS action is limited to 16
deleted file mode 100644
@@ -1,276 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
-
-#ifndef __BPF_API__
-#define __BPF_API__
-
-/* Note:
- *
- * This file can be included into eBPF kernel programs. It contains
- * a couple of useful helper functions, map/section ABI (bpf_elf.h),
- * misc macros and some eBPF specific LLVM built-ins.
- */
-
-#include <stdint.h>
-
-#include <linux/pkt_cls.h>
-#include <linux/bpf.h>
-#include <linux/filter.h>
-
-#include <asm/byteorder.h>
-
-#include "bpf_elf.h"
-
-/** libbpf pin type. */
-enum libbpf_pin_type {
- LIBBPF_PIN_NONE,
- /* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */
- LIBBPF_PIN_BY_NAME,
-};
-
-/** Type helper macros. */
-
-#define __uint(name, val) int (*name)[val]
-#define __type(name, val) typeof(val) *name
-#define __array(name, val) typeof(val) *name[]
-
-/** Misc macros. */
-
-#ifndef __stringify
-# define __stringify(X) #X
-#endif
-
-#ifndef __maybe_unused
-# define __maybe_unused __attribute__((__unused__))
-#endif
-
-#ifndef offsetof
-# define offsetof(TYPE, MEMBER) __builtin_offsetof(TYPE, MEMBER)
-#endif
-
-#ifndef likely
-# define likely(X) __builtin_expect(!!(X), 1)
-#endif
-
-#ifndef unlikely
-# define unlikely(X) __builtin_expect(!!(X), 0)
-#endif
-
-#ifndef htons
-# define htons(X) __constant_htons((X))
-#endif
-
-#ifndef ntohs
-# define ntohs(X) __constant_ntohs((X))
-#endif
-
-#ifndef htonl
-# define htonl(X) __constant_htonl((X))
-#endif
-
-#ifndef ntohl
-# define ntohl(X) __constant_ntohl((X))
-#endif
-
-#ifndef __inline__
-# define __inline__ __attribute__((always_inline))
-#endif
-
-/** Section helper macros. */
-
-#ifndef __section
-# define __section(NAME) \
- __attribute__((section(NAME), used))
-#endif
-
-#ifndef __section_tail
-# define __section_tail(ID, KEY) \
- __section(__stringify(ID) "/" __stringify(KEY))
-#endif
-
-#ifndef __section_xdp_entry
-# define __section_xdp_entry \
- __section(ELF_SECTION_PROG)
-#endif
-
-#ifndef __section_cls_entry
-# define __section_cls_entry \
- __section(ELF_SECTION_CLASSIFIER)
-#endif
-
-#ifndef __section_act_entry
-# define __section_act_entry \
- __section(ELF_SECTION_ACTION)
-#endif
-
-#ifndef __section_lwt_entry
-# define __section_lwt_entry \
- __section(ELF_SECTION_PROG)
-#endif
-
-#ifndef __section_license
-# define __section_license \
- __section(ELF_SECTION_LICENSE)
-#endif
-
-#ifndef __section_maps
-# define __section_maps \
- __section(ELF_SECTION_MAPS)
-#endif
-
-/** Declaration helper macros. */
-
-#ifndef BPF_LICENSE
-# define BPF_LICENSE(NAME) \
- char ____license[] __section_license = NAME
-#endif
-
-/** Classifier helper */
-
-#ifndef BPF_H_DEFAULT
-# define BPF_H_DEFAULT -1
-#endif
-
-/** BPF helper functions for tc. Individual flags are in linux/bpf.h */
-
-#ifndef __BPF_FUNC
-# define __BPF_FUNC(NAME, ...) \
- (* NAME)(__VA_ARGS__) __maybe_unused
-#endif
-
-#ifndef BPF_FUNC
-# define BPF_FUNC(NAME, ...) \
- __BPF_FUNC(NAME, __VA_ARGS__) = (void *) BPF_FUNC_##NAME
-#endif
-
-/* Map access/manipulation */
-static void *BPF_FUNC(map_lookup_elem, void *map, const void *key);
-static int BPF_FUNC(map_update_elem, void *map, const void *key,
- const void *value, uint32_t flags);
-static int BPF_FUNC(map_delete_elem, void *map, const void *key);
-
-/* Time access */
-static uint64_t BPF_FUNC(ktime_get_ns);
-
-/* Debugging */
-
-/* FIXME: __attribute__ ((format(printf, 1, 3))) not possible unless
- * llvm bug https://llvm.org/bugs/show_bug.cgi?id=26243 gets resolved.
- * It would require ____fmt to be made const, which generates a reloc
- * entry (non-map).
- */
-static void BPF_FUNC(trace_printk, const char *fmt, int fmt_size, ...);
-
-#ifndef printt
-# define printt(fmt, ...) \
- ({ \
- char ____fmt[] = fmt; \
- trace_printk(____fmt, sizeof(____fmt), ##__VA_ARGS__); \
- })
-#endif
-
-/* Random numbers */
-static uint32_t BPF_FUNC(get_prandom_u32);
-
-/* Tail calls */
-static void BPF_FUNC(tail_call, struct __sk_buff *skb, void *map,
- uint32_t index);
-
-/* System helpers */
-static uint32_t BPF_FUNC(get_smp_processor_id);
-static uint32_t BPF_FUNC(get_numa_node_id);
-
-/* Packet misc meta data */
-static uint32_t BPF_FUNC(get_cgroup_classid, struct __sk_buff *skb);
-static int BPF_FUNC(skb_under_cgroup, void *map, uint32_t index);
-
-static uint32_t BPF_FUNC(get_route_realm, struct __sk_buff *skb);
-static uint32_t BPF_FUNC(get_hash_recalc, struct __sk_buff *skb);
-static uint32_t BPF_FUNC(set_hash_invalid, struct __sk_buff *skb);
-
-/* Packet redirection */
-static int BPF_FUNC(redirect, int ifindex, uint32_t flags);
-static int BPF_FUNC(clone_redirect, struct __sk_buff *skb, int ifindex,
- uint32_t flags);
-
-/* Packet manipulation */
-static int BPF_FUNC(skb_load_bytes, struct __sk_buff *skb, uint32_t off,
- void *to, uint32_t len);
-static int BPF_FUNC(skb_store_bytes, struct __sk_buff *skb, uint32_t off,
- const void *from, uint32_t len, uint32_t flags);
-
-static int BPF_FUNC(l3_csum_replace, struct __sk_buff *skb, uint32_t off,
- uint32_t from, uint32_t to, uint32_t flags);
-static int BPF_FUNC(l4_csum_replace, struct __sk_buff *skb, uint32_t off,
- uint32_t from, uint32_t to, uint32_t flags);
-static int BPF_FUNC(csum_diff, const void *from, uint32_t from_size,
- const void *to, uint32_t to_size, uint32_t seed);
-static int BPF_FUNC(csum_update, struct __sk_buff *skb, uint32_t wsum);
-
-static int BPF_FUNC(skb_change_type, struct __sk_buff *skb, uint32_t type);
-static int BPF_FUNC(skb_change_proto, struct __sk_buff *skb, uint32_t proto,
- uint32_t flags);
-static int BPF_FUNC(skb_change_tail, struct __sk_buff *skb, uint32_t nlen,
- uint32_t flags);
-
-static int BPF_FUNC(skb_pull_data, struct __sk_buff *skb, uint32_t len);
-
-/* Event notification */
-static int __BPF_FUNC(skb_event_output, struct __sk_buff *skb, void *map,
- uint64_t index, const void *data, uint32_t size) =
- (void *) BPF_FUNC_perf_event_output;
-
-/* Packet vlan encap/decap */
-static int BPF_FUNC(skb_vlan_push, struct __sk_buff *skb, uint16_t proto,
- uint16_t vlan_tci);
-static int BPF_FUNC(skb_vlan_pop, struct __sk_buff *skb);
-
-/* Packet tunnel encap/decap */
-static int BPF_FUNC(skb_get_tunnel_key, struct __sk_buff *skb,
- struct bpf_tunnel_key *to, uint32_t size, uint32_t flags);
-static int BPF_FUNC(skb_set_tunnel_key, struct __sk_buff *skb,
- const struct bpf_tunnel_key *from, uint32_t size,
- uint32_t flags);
-
-static int BPF_FUNC(skb_get_tunnel_opt, struct __sk_buff *skb,
- void *to, uint32_t size);
-static int BPF_FUNC(skb_set_tunnel_opt, struct __sk_buff *skb,
- const void *from, uint32_t size);
-
-/** LLVM built-ins, mem*() routines work for constant size */
-
-#ifndef lock_xadd
-# define lock_xadd(ptr, val) ((void) __sync_fetch_and_add(ptr, val))
-#endif
-
-#ifndef memset
-# define memset(s, c, n) __builtin_memset((s), (c), (n))
-#endif
-
-#ifndef memcpy
-# define memcpy(d, s, n) __builtin_memcpy((d), (s), (n))
-#endif
-
-#ifndef memmove
-# define memmove(d, s, n) __builtin_memmove((d), (s), (n))
-#endif
-
-/* FIXME: __builtin_memcmp() is not yet fully usable unless llvm bug
- * https://llvm.org/bugs/show_bug.cgi?id=26218 gets resolved. Also
- * this one would generate a reloc entry (non-map), otherwise.
- */
-#if 0
-#ifndef memcmp
-# define memcmp(a, b, n) __builtin_memcmp((a), (b), (n))
-#endif
-#endif
-
-unsigned long long load_byte(void *skb, unsigned long long off)
- asm ("llvm.bpf.load.byte");
-
-unsigned long long load_half(void *skb, unsigned long long off)
- asm ("llvm.bpf.load.half");
-
-unsigned long long load_word(void *skb, unsigned long long off)
- asm ("llvm.bpf.load.word");
-
-#endif /* __BPF_API__ */
deleted file mode 100644
@@ -1,53 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
-#ifndef __BPF_ELF__
-#define __BPF_ELF__
-
-#include <asm/types.h>
-
-/* Note:
- *
- * Below ELF section names and bpf_elf_map structure definition
- * are not (!) kernel ABI. It's rather a "contract" between the
- * application and the BPF loader in tc. For compatibility, the
- * section names should stay as-is. Introduction of aliases, if
- * needed, are a possibility, though.
- */
-
-/* ELF section names, etc */
-#define ELF_SECTION_LICENSE "license"
-#define ELF_SECTION_MAPS "maps"
-#define ELF_SECTION_PROG "prog"
-#define ELF_SECTION_CLASSIFIER "classifier"
-#define ELF_SECTION_ACTION "action"
-
-#define ELF_MAX_MAPS 64
-#define ELF_MAX_LICENSE_LEN 128
-
-/* Object pinning settings */
-#define PIN_NONE 0
-#define PIN_OBJECT_NS 1
-#define PIN_GLOBAL_NS 2
-
-/* ELF map definition */
-struct bpf_elf_map {
- __u32 type;
- __u32 size_key;
- __u32 size_value;
- __u32 max_elem;
- __u32 flags;
- __u32 id;
- __u32 pinning;
- __u32 inner_id;
- __u32 inner_idx;
-};
-
-#define BPF_ANNOTATE_KV_PAIR(name, type_key, type_val) \
- struct ____btf_map_##name { \
- type_key key; \
- type_val value; \
- }; \
- struct ____btf_map_##name \
- __attribute__ ((section(".maps." #name), used)) \
- ____btf_map_##name = { }
-
-#endif /* __BPF_ELF__ */
deleted file mode 100644
@@ -1,85 +0,0 @@
-#!/usr/bin/env python3
-# SPDX-License-Identifier: BSD-3-Clause
-# Copyright (c) 2023 Stephen Hemminger <stephen@networkplumber.org>
-
-import argparse
-import sys
-import struct
-from tempfile import TemporaryFile
-from elftools.elf.elffile import ELFFile
-
-
-def load_sections(elffile):
- """Get sections of interest from ELF"""
- result = []
- parts = [("cls_q", "cls_q_insns"), ("l3_l4", "l3_l4_hash_insns")]
- for name, tag in parts:
- section = elffile.get_section_by_name(name)
- if section:
- insns = struct.iter_unpack('<BBhL', section.data())
- result.append([tag, insns])
- return result
-
-
-def dump_section(name, insns, out):
- """Dump the array of BPF instructions"""
- print(f'\nstatic struct bpf_insn {name}[] = {{', file=out)
- for bpf in insns:
- code = bpf[0]
- src = bpf[1] >> 4
- dst = bpf[1] & 0xf
- off = bpf[2]
- imm = bpf[3]
- print(f'\t{{{code:#04x}, {dst:4d}, {src:4d}, {off:8d}, {imm:#010x}}},',
- file=out)
- print('};', file=out)
-
-
-def parse_args():
- """Parse command line arguments"""
- parser = argparse.ArgumentParser()
- parser.add_argument('-s',
- '--source',
- type=str,
- help="original source file")
- parser.add_argument('-o', '--out', type=str, help="output C file path")
- parser.add_argument("file",
- nargs='+',
- help="object file path or '-' for stdin")
- return parser.parse_args()
-
-
-def open_input(path):
- """Open the file or stdin"""
- if path == "-":
- temp = TemporaryFile()
- temp.write(sys.stdin.buffer.read())
- return temp
- return open(path, 'rb')
-
-
-def write_header(out, source):
- """Write file intro header"""
- print("/* SPDX-License-Identifier: BSD-3-Clause", file=out)
- if source:
- print(f' * Auto-generated from {source}', file=out)
- print(" * This not the original source file. Do NOT edit it.", file=out)
- print(" */\n", file=out)
-
-
-def main():
- '''program main function'''
- args = parse_args()
-
- with open(args.out, 'w',
- encoding="utf-8") if args.out else sys.stdout as out:
- write_header(out, args.source)
- for path in args.file:
- elffile = ELFFile(open_input(path))
- sections = load_sections(elffile)
- for name, insns in sections:
- dump_section(name, insns, out)
-
-
-if __name__ == "__main__":
- main()
new file mode 100644
@@ -0,0 +1,81 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2024 Stephen Hemminger <stephen@networkplumber.org>
+
+enable_tap_rss = false  # only set to true once every tool check below has passed
+
+libbpf = dependency('libbpf', required: false, method: 'pkg-config')
+if not libbpf.found()
+ message('net/tap: no RSS support missing libbpf')
+ subdir_done()
+endif
+
+# Debian installs bpftool in /usr/sbin which is not in $PATH
+bpftool = find_program('bpftool', '/usr/sbin/bpftool', required: false, version: '>= 5.6.0')
+if not bpftool.found()
+ message('net/tap: no RSS support missing bpftool')
+ subdir_done()
+endif
+
+clang_supports_bpf = false
+clang = find_program('clang', required: false)
+if clang.found()
+ clang_supports_bpf = run_command(clang, '-target', 'bpf', '--print-supported-cpus',
+ check: false).returncode() == 0
+endif
+
+if not clang_supports_bpf
+ message('net/tap: no RSS support missing clang BPF')
+ subdir_done()
+endif
+
+enable_tap_rss = true
+
+libbpf_include_dir = libbpf.get_variable(pkgconfig : 'includedir')
+
+# The include files <linux/bpf.h> and others include <asm/types.h>
+# but <asm/types.h> is not defined for multi-lib environment target.
+# Workaround by using include directory from the host build environment.
+machine_name = run_command('uname', '-m', check: true).stdout().strip()
+march_include_dir = '/usr/include/' + machine_name + '-linux-gnu'
+
+clang_flags = [  # options used to compile the BPF object
+ '-O2',
+ '-Wall',
+ '-Wextra',
+ '-target',
+ 'bpf',
+ '-g',
+ '-c',
+]
+
+bpf_o_cmd = [  # clang invocation: @INPUT@ source to @OUTPUT@ object
+ clang,
+ clang_flags,
+ '-idirafter',
+ libbpf_include_dir,
+ '-idirafter',
+ march_include_dir,
+ '@INPUT@',
+ '-o',
+ '@OUTPUT@'
+]
+
+skel_h_cmd = [  # "bpftool gen skeleton <object>"; no output argument is passed
+ bpftool,
+ 'gen',
+ 'skeleton',
+ '@INPUT@'
+]
+
+tap_rss_o = custom_target(  # tap_rss.c -> tap_rss.o using bpf_o_cmd
+ 'tap_rss.bpf.o',
+ input: 'tap_rss.c',
+ output: 'tap_rss.o',
+ command: bpf_o_cmd)
+
+tap_rss_skel_h = custom_target(  # tap_rss.o -> tap_rss.skel.h using skel_h_cmd
+ 'tap_rss.skel.h',
+ input: tap_rss_o,
+ output: 'tap_rss.skel.h',
+ command: skel_h_cmd,
+ capture: true)  # the command's stdout becomes the output file
deleted file mode 100644
@@ -1,255 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
- * Copyright 2017 Mellanox Technologies, Ltd
- */
-
-#include <stdint.h>
-#include <stdbool.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <asm/types.h>
-#include <linux/in.h>
-#include <linux/if.h>
-#include <linux/if_ether.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <linux/if_tunnel.h>
-#include <linux/filter.h>
-
-#include "bpf_api.h"
-#include "bpf_elf.h"
-#include "../tap_rss.h"
-
-/** Create IPv4 address */
-#define IPv4(a, b, c, d) ((__u32)(((a) & 0xff) << 24) | \
- (((b) & 0xff) << 16) | \
- (((c) & 0xff) << 8) | \
- ((d) & 0xff))
-
-#define PORT(a, b) ((__u16)(((a) & 0xff) << 8) | \
- ((b) & 0xff))
-
-/*
- * The queue number is offset by a unique QUEUE_OFFSET, to distinguish
- * packets that have gone through this rule (skb->cb[1] != 0) from others.
- */
-#define QUEUE_OFFSET 0x7cafe800
-#define PIN_GLOBAL_NS 2
-
-#define KEY_IDX 0
-#define BPF_MAP_ID_KEY 1
-
-struct vlan_hdr {
- __be16 proto;
- __be16 tci;
-};
-
-struct bpf_elf_map __attribute__((section("maps"), used))
-map_keys = {
- .type = BPF_MAP_TYPE_HASH,
- .id = BPF_MAP_ID_KEY,
- .size_key = sizeof(__u32),
- .size_value = sizeof(struct rss_key),
- .max_elem = 256,
- .pinning = PIN_GLOBAL_NS,
-};
-
-__section("cls_q") int
-match_q(struct __sk_buff *skb)
-{
- __u32 queue = skb->cb[1];
- /* queue is set by tap_flow_bpf_cls_q() before load */
- volatile __u32 q = 0xdeadbeef;
- __u32 match_queue = QUEUE_OFFSET + q;
-
- /* printt("match_q$i() queue = %d\n", queue); */
-
- if (queue != match_queue)
- return TC_ACT_OK;
-
- /* queue match */
- skb->cb[1] = 0;
- return TC_ACT_UNSPEC;
-}
-
-
-struct ipv4_l3_l4_tuple {
- __u32 src_addr;
- __u32 dst_addr;
- __u16 dport;
- __u16 sport;
-} __attribute__((packed));
-
-struct ipv6_l3_l4_tuple {
- __u8 src_addr[16];
- __u8 dst_addr[16];
- __u16 dport;
- __u16 sport;
-} __attribute__((packed));
-
-static const __u8 def_rss_key[TAP_RSS_HASH_KEY_SIZE] = {
- 0xd1, 0x81, 0xc6, 0x2c,
- 0xf7, 0xf4, 0xdb, 0x5b,
- 0x19, 0x83, 0xa2, 0xfc,
- 0x94, 0x3e, 0x1a, 0xdb,
- 0xd9, 0x38, 0x9e, 0x6b,
- 0xd1, 0x03, 0x9c, 0x2c,
- 0xa7, 0x44, 0x99, 0xad,
- 0x59, 0x3d, 0x56, 0xd9,
- 0xf3, 0x25, 0x3c, 0x06,
- 0x2a, 0xdc, 0x1f, 0xfc,
-};
-
-static __u32 __attribute__((always_inline))
-rte_softrss_be(const __u32 *input_tuple, const uint8_t *rss_key,
- __u8 input_len)
-{
- __u32 i, j, hash = 0;
-#pragma unroll
- for (j = 0; j < input_len; j++) {
-#pragma unroll
- for (i = 0; i < 32; i++) {
- if (input_tuple[j] & (1U << (31 - i))) {
- hash ^= ((const __u32 *)def_rss_key)[j] << i |
- (__u32)((uint64_t)
- (((const __u32 *)def_rss_key)[j + 1])
- >> (32 - i));
- }
- }
- }
- return hash;
-}
-
-static int __attribute__((always_inline))
-rss_l3_l4(struct __sk_buff *skb)
-{
- void *data_end = (void *)(long)skb->data_end;
- void *data = (void *)(long)skb->data;
- __u16 proto = (__u16)skb->protocol;
- __u32 key_idx = 0xdeadbeef;
- __u32 hash;
- struct rss_key *rsskey;
- __u64 off = ETH_HLEN;
- int j;
- __u8 *key = 0;
- __u32 len;
- __u32 queue = 0;
- bool mf = 0;
- __u16 frag_off = 0;
-
- rsskey = map_lookup_elem(&map_keys, &key_idx);
- if (!rsskey) {
- printt("hash(): rss key is not configured\n");
- return TC_ACT_OK;
- }
- key = (__u8 *)rsskey->key;
-
- /* Get correct proto for 802.1ad */
- if (skb->vlan_present && skb->vlan_proto == htons(ETH_P_8021AD)) {
- if (data + ETH_ALEN * 2 + sizeof(struct vlan_hdr) +
- sizeof(proto) > data_end)
- return TC_ACT_OK;
- proto = *(__u16 *)(data + ETH_ALEN * 2 +
- sizeof(struct vlan_hdr));
- off += sizeof(struct vlan_hdr);
- }
-
- if (proto == htons(ETH_P_IP)) {
- if (data + off + sizeof(struct iphdr) + sizeof(__u32)
- > data_end)
- return TC_ACT_OK;
-
- __u8 *src_dst_addr = data + off + offsetof(struct iphdr, saddr);
- __u8 *frag_off_addr = data + off + offsetof(struct iphdr, frag_off);
- __u8 *prot_addr = data + off + offsetof(struct iphdr, protocol);
- __u8 *src_dst_port = data + off + sizeof(struct iphdr);
- struct ipv4_l3_l4_tuple v4_tuple = {
- .src_addr = IPv4(*(src_dst_addr + 0),
- *(src_dst_addr + 1),
- *(src_dst_addr + 2),
- *(src_dst_addr + 3)),
- .dst_addr = IPv4(*(src_dst_addr + 4),
- *(src_dst_addr + 5),
- *(src_dst_addr + 6),
- *(src_dst_addr + 7)),
- .sport = 0,
- .dport = 0,
- };
- /** Fetch the L4-payer port numbers only in-case of TCP/UDP
- ** and also if the packet is not fragmented. Since fragmented
- ** chunks do not have L4 TCP/UDP header.
- **/
- if (*prot_addr == IPPROTO_UDP || *prot_addr == IPPROTO_TCP) {
- frag_off = PORT(*(frag_off_addr + 0),
- *(frag_off_addr + 1));
- mf = frag_off & 0x2000;
- frag_off = frag_off & 0x1fff;
- if (mf == 0 && frag_off == 0) {
- v4_tuple.sport = PORT(*(src_dst_port + 0),
- *(src_dst_port + 1));
- v4_tuple.dport = PORT(*(src_dst_port + 2),
- *(src_dst_port + 3));
- }
- }
- __u8 input_len = sizeof(v4_tuple) / sizeof(__u32);
- if (rsskey->hash_fields & (1 << HASH_FIELD_IPV4_L3))
- input_len--;
- hash = rte_softrss_be((__u32 *)&v4_tuple, key, 3);
- } else if (proto == htons(ETH_P_IPV6)) {
- if (data + off + sizeof(struct ipv6hdr) +
- sizeof(__u32) > data_end)
- return TC_ACT_OK;
- __u8 *src_dst_addr = data + off +
- offsetof(struct ipv6hdr, saddr);
- __u8 *src_dst_port = data + off +
- sizeof(struct ipv6hdr);
- __u8 *next_hdr = data + off +
- offsetof(struct ipv6hdr, nexthdr);
-
- struct ipv6_l3_l4_tuple v6_tuple;
- for (j = 0; j < 4; j++)
- *((uint32_t *)&v6_tuple.src_addr + j) =
- __builtin_bswap32(*((uint32_t *)
- src_dst_addr + j));
- for (j = 0; j < 4; j++)
- *((uint32_t *)&v6_tuple.dst_addr + j) =
- __builtin_bswap32(*((uint32_t *)
- src_dst_addr + 4 + j));
-
- /** Fetch the L4 header port-numbers only if next-header
- * is TCP/UDP **/
- if (*next_hdr == IPPROTO_UDP || *next_hdr == IPPROTO_TCP) {
- v6_tuple.sport = PORT(*(src_dst_port + 0),
- *(src_dst_port + 1));
- v6_tuple.dport = PORT(*(src_dst_port + 2),
- *(src_dst_port + 3));
- } else {
- v6_tuple.sport = 0;
- v6_tuple.dport = 0;
- }
-
- __u8 input_len = sizeof(v6_tuple) / sizeof(__u32);
- if (rsskey->hash_fields & (1 << HASH_FIELD_IPV6_L3))
- input_len--;
- hash = rte_softrss_be((__u32 *)&v6_tuple, key, 9);
- } else {
- return TC_ACT_PIPE;
- }
-
- queue = rsskey->queues[(hash % rsskey->nb_queues) &
- (TAP_MAX_QUEUES - 1)];
- skb->cb[1] = QUEUE_OFFSET + queue;
- /* printt(">>>>> rss_l3_l4 hash=0x%x queue=%u\n", hash, queue); */
-
- return TC_ACT_RECLASSIFY;
-}
-
-#define RSS(L) \
- __section(#L) int \
- L ## _hash(struct __sk_buff *skb) \
- { \
- return rss_ ## L (skb); \
- }
-
-RSS(l3_l4)
-
-BPF_LICENSE("Dual BSD/GPL");
new file mode 100644
@@ -0,0 +1,272 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+ * Copyright 2017 Mellanox Technologies, Ltd
+ */
+
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/pkt_cls.h>
+#include <linux/bpf.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#include "../tap_rss.h"
+
+/*
+ * Per-class RSS configuration (struct rss_key: hash key, hash fields and
+ * queue count) looked up by rss_flow_action() on every invocation.
+ *
+ * The hash is indexed by the 16-bit tc_classid read from the skb cb area.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(__u16));
+ __uint(value_size, sizeof(struct rss_key));
+ __uint(max_entries, TAP_MAX_QUEUES);
+} rss_map SEC(".maps");
+
+
+#define IP_MF 0x2000 /* more-fragments flag within iphdr frag_off */
+#define IP_OFFSET 0x1FFF /* fragment offset bits within iphdr frag_off */
+
+/*
+ * Toeplitz hash over input_len 32-bit words; key must hold input_len + 1 words.
+ * Same as rte_softrss_be in lib/hash, with loops unrolled for BPF. key[j + 1]
+ * is widened to 64 bits since a 32-bit shift by 32 (i == 0) is undefined.
+ */
+static __u32 __attribute__((always_inline))
+softrss_be(const __u32 *input_tuple, __u32 input_len, const __u32 *key)
+{
+	__u32 i, j, hash = 0;
+
+#pragma unroll
+	for (j = 0; j < input_len; j++) {
+#pragma unroll
+		for (i = 0; i < 32; i++) {
+			if (input_tuple[j] & (1U << (31 - i)))
+				hash ^= key[j] << i | (__u32)((__u64)key[j + 1] >> (32 - i));
+		}
+	}
+	return hash;
+}
+
+/* Compute the RSS hash of an IPv4 packet.
+ *
+ * With the L3 hash field selected only the addresses are hashed; otherwise
+ * the addresses and TCP/UDP ports are. Returns 0 when no hash applies:
+ * header unreadable, fragmented packet, or a protocol other than TCP/UDP.
+ */
+static __u32 __attribute__((always_inline))
+parse_ipv4(const struct __sk_buff *skb, __u32 hash_type, const __u32 *key)
+{
+	struct {
+		__u32 src_addr;
+		__u32 dst_addr;
+		__u16 dport;
+		__u16 sport;
+	} tuple = { };
+	const __u32 tuple_words = sizeof(tuple) / sizeof(__u32);
+	struct iphdr ip;
+	__u16 ports[2];
+
+	/* Network header is read relative to BPF_HDR_START_NET */
+	if (bpf_skb_load_bytes_relative(skb, 0, &ip, sizeof(ip), BPF_HDR_START_NET) != 0)
+		return 0;
+
+	tuple.src_addr = bpf_ntohl(ip.saddr);
+	tuple.dst_addr = bpf_ntohl(ip.daddr);
+
+	/* L3-only hashing skips the trailing port word */
+	if (hash_type & (1 << HASH_FIELD_IPV4_L3))
+		return softrss_be((__u32 *)&tuple, tuple_words - 1, key);
+
+	/* Fragments (MF set or non-zero offset) get no L4 hash */
+	if (ip.frag_off & bpf_htons(IP_MF | IP_OFFSET))
+		return 0;
+
+	/* Only TCP and UDP contribute ports */
+	if (ip.protocol != IPPROTO_UDP && ip.protocol != IPPROTO_TCP)
+		return 0;
+
+	/* Ports sit right after the IPv4 header, whose length is ihl words */
+	if (bpf_skb_load_bytes_relative(skb, ip.ihl * 4, ports, sizeof(ports),
+					BPF_HDR_START_NET) != 0)
+		return 0;
+
+	tuple.sport = bpf_ntohs(ports[0]);
+	tuple.dport = bpf_ntohs(ports[1]);
+	return softrss_be((__u32 *)&tuple, tuple_words, key);
+}
+
+/* Skip IPv6 extension headers from *off, flagging fragments in *frag.
+ * Returns the next protocol, or -1 if a load fails or too many are chained.
+ */
+static int __attribute__((always_inline))
+skip_ip6_ext(__u16 proto, const struct __sk_buff *skb, __u32 *off, int *frag)
+{
+ struct ext_hdr {
+ __u8 next_hdr;
+ __u8 len;
+ } xh;
+ unsigned int i;
+
+ *frag = 0;
+
+#define MAX_EXT_HDRS 5
+#pragma unroll
+ for (i = 0; i < MAX_EXT_HDRS; i++) {
+ switch (proto) {
+ case IPPROTO_HOPOPTS:
+ case IPPROTO_ROUTING:
+ case IPPROTO_DSTOPTS:
+ if (bpf_skb_load_bytes_relative(skb, *off, &xh, sizeof(xh),
+ BPF_HDR_START_NET))
+ return -1;
+
+ *off += (xh.len + 1) * 8; /* len counts 8-byte units beyond the first */
+ proto = xh.next_hdr;
+ break;
+ case IPPROTO_FRAGMENT:
+ if (bpf_skb_load_bytes_relative(skb, *off, &xh, sizeof(xh),
+ BPF_HDR_START_NET))
+ return -1;
+
+ *off += 8; /* the fragment header is stepped over as a fixed 8 bytes */
+ proto = xh.next_hdr;
+ *frag = 1;
+ return proto; /* walk stops here; callers in this file bail out when *frag is set */
+ default:
+ return proto;
+ }
+ }
+
+ /* MAX_EXT_HDRS headers skipped without reaching another protocol: give up */
+ return -1;
+}
+
+/* Compute the RSS hash of an IPv6 packet.
+ * With the L3 hash field selected only the addresses are hashed; otherwise
+ * the addresses and TCP/UDP ports are. Returns 0 when no hash applies.
+ */
+static __u32 __attribute__((always_inline))
+parse_ipv6(const struct __sk_buff *skb, __u32 hash_type, const __u32 *key)
+{
+	struct {
+		__u32 src_addr[4];
+		__u32 dst_addr[4];
+		__u16 dport;
+		__u16 sport;
+	} tuple = { };
+	const __u32 tuple_words = sizeof(tuple) / sizeof(__u32);
+	__u32 l4_off = sizeof(struct ipv6hdr), w;
+	struct ipv6hdr hdr;
+	int nexthdr, is_frag;
+	__u16 ports[2];
+
+	if (bpf_skb_load_bytes_relative(skb, 0, &hdr, sizeof(hdr), BPF_HDR_START_NET) != 0)
+		return 0;
+
+	/* Addresses are hashed as host-order 32-bit words */
+#pragma unroll
+	for (w = 0; w < 4; w++) {
+		tuple.src_addr[w] = bpf_ntohl(hdr.saddr.in6_u.u6_addr32[w]);
+		tuple.dst_addr[w] = bpf_ntohl(hdr.daddr.in6_u.u6_addr32[w]);
+	}
+
+	/* L3-only hashing skips the trailing port word */
+	if (hash_type & (1 << HASH_FIELD_IPV6_L3))
+		return softrss_be((__u32 *)&tuple, tuple_words - 1, key);
+
+	/* Malformed chain, fragment, or non TCP/UDP payload: no hash */
+	nexthdr = skip_ip6_ext(hdr.nexthdr, skb, &l4_off, &is_frag);
+	if (nexthdr < 0 || is_frag)
+		return 0;
+	if (nexthdr != IPPROTO_UDP && nexthdr != IPPROTO_TCP)
+		return 0;
+
+	/* Ports are read at the offset left by skip_ip6_ext() */
+	if (bpf_skb_load_bytes_relative(skb, l4_off, ports, sizeof(ports),
+					BPF_HDR_START_NET) != 0)
+		return 0;
+
+	tuple.sport = bpf_ntohs(ports[0]);
+	tuple.dport = bpf_ntohs(ports[1]);
+	return softrss_be((__u32 *)&tuple, tuple_words, key);
+}
+
+/*
+ * Dispatch on the skb protocol to the IPv4 or IPv6 hash routine.
+ * Any other protocol yields 0, meaning no RSS hash is available.
+ */
+static __u32 __attribute__((always_inline))
+calculate_rss_hash(const struct __sk_buff *skb, const struct rss_key *rsskey)
+{
+	const __u32 *toeplitz_key = (const __u32 *)rsskey->key;
+	const __u32 l3_proto = skb->protocol;
+
+	if (l3_proto == bpf_htons(ETH_P_IP))
+		return parse_ipv4(skb, rsskey->hash_fields, toeplitz_key);
+	if (l3_proto == bpf_htons(ETH_P_IPV6))
+		return parse_ipv6(skb, rsskey->hash_fields, toeplitz_key);
+	return 0;
+}
+
+/* Map a 32-bit value onto [0, n) by taking the high word of val * n */
+static __u32 __attribute__((always_inline))
+reciprocal_scale(__u32 val, __u32 n)
+{
+	return (__u32)(((__u64)n * (__u64)val) >> 32);
+}
+
+/* local copy of the qdisc skb cb layout (from sch_generic.h); NOTE(review): keep in sync */
+struct qdisc_skb_cb {
+ struct {
+ unsigned int pkt_len;
+ __u16 dev_queue_mapping;
+ __u16 tc_classid; /* read by rss_flow_action() and used as the rss_map key */
+ };
+#define QDISC_CB_PRIV_LEN 20
+ unsigned char data[QDISC_CB_PRIV_LEN];
+};
+
+/*
+ * tc action entry point: steers a packet to a queue chosen by its RSS hash.
+ *
+ * Returns TC_ACT_OK, leaving the packet untouched, when no RSS configuration
+ * exists for its class or no hash can be computed (hash == 0). Otherwise sets
+ * skb->queue_mapping and returns TC_ACT_PIPE to continue processing.
+ * Per-packet tracing is left out: bpf_printk() is a debugging aid only.
+ *
+ * This should be BPF_PROG_TYPE_SCHED_ACT so section needs to be "action"
+ */
+SEC("action") int
+rss_flow_action(struct __sk_buff *skb)
+{
+	const struct rss_key *rsskey;
+	__u16 classid;
+	__u32 hash;
+
+	/* TC layer puts the BPF_CLASSID into the skb cb area.
+	 * NOTE(review): confirm skb->cb aliases the start of qdisc_skb_cb and not
+	 * its data[] area; __sk_buff also exposes tc_classid directly. */
+	classid = ((const struct qdisc_skb_cb *)skb->cb)->tc_classid;
+
+	/* Lookup RSS configuration for that BPF class */
+	rsskey = bpf_map_lookup_elem(&rss_map, &classid);
+	if (rsskey == NULL) {
+		bpf_printk("hash(): rss not configured");
+		return TC_ACT_OK;
+	}
+
+	hash = calculate_rss_hash(skb, rsskey);
+	if (hash) {
+		/* Fold hash to the number of queues configured */
+		skb->queue_mapping = reciprocal_scale(hash, rsskey->nb_queues);
+		return TC_ACT_PIPE;
+	}
+	return TC_ACT_OK;
+}
+
+char _license[] SEC("license") = "Dual BSD/GPL"; /* license string exported via SEC("license") */