diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-01-12 18:57:02 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-01-12 18:57:02 -0800 |
| commit | aee3bfa3307cd0da2126bdc0ea359dabea5ee8f7 (patch) | |
| tree | 3d35c69e8fa835098bb90f77f30abed120681651 /include/linux/cgroup-defs.h | |
| parent | c597b6bcd5c624534afc3df65cdc42bb05173bca (diff) | |
| parent | 415b6f19e87e350b13585591859d4fdf50772229 (diff) | |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from Davic Miller:
1) Support busy polling generically, for all NAPI drivers. From Eric
Dumazet.
2) Add byte/packet counter support to nft_ct, from Floriani Westphal.
3) Add RSS/XPS support to mvneta driver, from Gregory Clement.
4) Implement IPV6_HDRINCL socket option for raw sockets, from Hannes
Frederic Sowa.
5) Add support for T6 adapter to cxgb4 driver, from Hariprasad Shenai.
6) Add support for VLAN device bridging to mlxsw switch driver, from
Ido Schimmel.
7) Add driver for Netronome NFP4000/NFP6000, from Jakub Kicinski.
8) Provide hwmon interface to mlxsw switch driver, from Jiri Pirko.
9) Reorganize wireless drivers into per-vendor directories just like we
do for ethernet drivers. From Kalle Valo.
10) Provide a way for administrators "destroy" connected sockets via the
SOCK_DESTROY socket netlink diag operation. From Lorenzo Colitti.
11) Add support to add/remove multicast routes via netlink, from Nikolay
Aleksandrov.
12) Make TCP keepalive settings per-namespace, from Nikolay Borisov.
13) Add forwarding and packet duplication facilities to nf_tables, from
Pablo Neira Ayuso.
14) Dead route support in MPLS, from Roopa Prabhu.
15) TSO support for thunderx chips, from Sunil Goutham.
16) Add driver for IBM's System i/p VNIC protocol, from Thomas Falcon.
17) Rationalize, consolidate, and more completely document the checksum
offloading facilities in the networking stack. From Tom Herbert.
18) Support aborting an ongoing scan in mac80211/cfg80211, from
Vidyullatha Kanchanapally.
19) Use per-bucket spinlock for bpf hash facility, from Tom Leiming.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1375 commits)
net: bnxt: always return values from _bnxt_get_max_rings
net: bpf: reject invalid shifts
phonet: properly unshare skbs in phonet_rcv()
dwc_eth_qos: Fix dma address for multi-fragment skbs
phy: remove an unneeded condition
mdio: remove an unneed condition
mdio_bus: NULL dereference on allocation error
net: Fix typo in netdev_intersect_features
net: freescale: mac-fec: Fix build error from phy_device API change
net: freescale: ucc_geth: Fix build error from phy_device API change
bonding: Prevent IPv6 link local address on enslaved devices
IB/mlx5: Add flow steering support
net/mlx5_core: Export flow steering API
net/mlx5_core: Make ipv4/ipv6 location more clear
net/mlx5_core: Enable flow steering support for the IB driver
net/mlx5_core: Initialize namespaces only when supported by device
net/mlx5_core: Set priority attributes
net/mlx5_core: Connect flow tables
net/mlx5_core: Introduce modify flow table command
net/mlx5_core: Managing root flow table
...
Diffstat (limited to 'include/linux/cgroup-defs.h')
| -rw-r--r-- | include/linux/cgroup-defs.h | 126 |
1 files changed, 126 insertions, 0 deletions
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 06b77f9dd3f2..e5f4164cbd99 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -231,6 +231,14 @@ struct cgroup { int id; /* + * The depth this cgroup is at. The root is at depth zero and each + * step down the hierarchy increments the level. This along with + * ancestor_ids[] can determine whether a given cgroup is a + * descendant of another without traversing the hierarchy. + */ + int level; + + /* * Each non-empty css_set associated with this cgroup contributes * one to populated_cnt. All children with non-zero popuplated_cnt * of their own contribute one. The count is zero iff there's no @@ -285,6 +293,9 @@ struct cgroup { /* used to schedule release agent */ struct work_struct release_agent_work; + + /* ids of the ancestors at each level including self */ + int ancestor_ids[]; }; /* @@ -304,6 +315,9 @@ struct cgroup_root { /* The root cgroup. Root is destroyed on its release. */ struct cgroup cgrp; + /* for cgrp->ancestor_ids[0] */ + int cgrp_ancestor_id_storage; + /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; @@ -521,4 +535,116 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} #endif /* CONFIG_CGROUPS */ +#ifdef CONFIG_SOCK_CGROUP_DATA + +/* + * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains + * per-socket cgroup information except for memcg association. + * + * On legacy hierarchies, net_prio and net_cls controllers directly set + * attributes on each sock which can then be tested by the network layer. + * On the default hierarchy, each sock is associated with the cgroup it was + * created in and the networking layer can match the cgroup directly. + * + * To avoid carrying all three cgroup related fields separately in sock, + * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer. + * On boot, sock_cgroup_data records the cgroup that the sock was created + * in so that cgroup2 matches can be made; however, once either net_prio or + * net_cls starts being used, the area is overriden to carry prioidx and/or + * classid. The two modes are distinguished by whether the lowest bit is + * set. Clear bit indicates cgroup pointer while set bit prioidx and + * classid. + * + * While userland may start using net_prio or net_cls at any time, once + * either is used, cgroup2 matching no longer works. There is no reason to + * mix the two and this is in line with how legacy and v2 compatibility is + * handled. On mode switch, cgroup references which are already being + * pointed to by socks may be leaked. While this can be remedied by adding + * synchronization around sock_cgroup_data, given that the number of leaked + * cgroups is bound and highly unlikely to be high, this seems to be the + * better trade-off. + */ +struct sock_cgroup_data { + union { +#ifdef __LITTLE_ENDIAN + struct { + u8 is_data; + u8 padding; + u16 prioidx; + u32 classid; + } __packed; +#else + struct { + u32 classid; + u16 prioidx; + u8 padding; + u8 is_data; + } __packed; +#endif + u64 val; + }; +}; + +/* + * There's a theoretical window where the following accessors race with + * updaters and return part of the previous pointer as the prioidx or + * classid. Such races are short-lived and the result isn't critical. + */ +static inline u16 sock_cgroup_prioidx(struct sock_cgroup_data *skcd) +{ + /* fallback to 1 which is always the ID of the root cgroup */ + return (skcd->is_data & 1) ? skcd->prioidx : 1; +} + +static inline u32 sock_cgroup_classid(struct sock_cgroup_data *skcd) +{ + /* fallback to 0 which is the unconfigured default classid */ + return (skcd->is_data & 1) ? skcd->classid : 0; +} + +/* + * If invoked concurrently, the updaters may clobber each other. The + * caller is responsible for synchronization. + */ +static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, + u16 prioidx) +{ + struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }}; + + if (sock_cgroup_prioidx(&skcd_buf) == prioidx) + return; + + if (!(skcd_buf.is_data & 1)) { + skcd_buf.val = 0; + skcd_buf.is_data = 1; + } + + skcd_buf.prioidx = prioidx; + WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */ +} + +static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, + u32 classid) +{ + struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }}; + + if (sock_cgroup_classid(&skcd_buf) == classid) + return; + + if (!(skcd_buf.is_data & 1)) { + skcd_buf.val = 0; + skcd_buf.is_data = 1; + } + + skcd_buf.classid = classid; + WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */ +} + +#else /* CONFIG_SOCK_CGROUP_DATA */ + +struct sock_cgroup_data { +}; + +#endif /* CONFIG_SOCK_CGROUP_DATA */ + #endif /* _LINUX_CGROUP_DEFS_H */ |
