字段 | 解释 |
bytes | The total number of bytes of data transmitted or received by the interface. |
packets | The total number of packets of data transmitted or received by the interface. |
errs | The total number of transmit or receive errors detected by the device driver. |
drop | The total number of packets dropped by the device driver. |
fifo | The number of FIFO buffer errors. |
frame | The number of packet framing errors. |
colls | The number of collisions detected on the interface. |
compressed | The number of compressed packets transmitted or received by the device driver. (This appears to be unused in the 2.2.15 kernel.) |
carrier | The number of carrier losses detected by the device driver. |
multicast | The number of multicast frames transmitted or received by the device driver. |
static int dev_seq_show(struct seq_file *seq, void *v)
{
if (v == SEQ_START_TOKEN)
/* 输出/proc/net/dev表头部分 */
seq_puts(seq, "Inter-| Receive "
" | Transmit\n"
" face |bytes packets errs drop fifo frame "
"compressed multicast|bytes packets errs "
"drop fifo colls carrier compressed\n");
else
/* 输出/proc/net/dev数据部分 */
dev_seq_printf_stats(seq, v);
return 0;
}
static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
{
struct rtnl_link_stats64 temp;
/* 数据源从下面的函数中取得 */
const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
/* /proc/net/dev 各个字段的数据算法 */
seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
"%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
dev->name, stats->rx_bytes, stats->rx_packets,
stats->rx_errors,
stats->rx_dropped + stats->rx_missed_errors,
stats->rx_fifo_errors,
stats->rx_length_errors + stats->rx_over_errors +
stats->rx_crc_errors + stats->rx_frame_errors,
stats->rx_compressed, stats->multicast,
stats->tx_bytes, stats->tx_packets,
stats->tx_errors, stats->tx_dropped,
stats->tx_fifo_errors, stats->collisions,
stats->tx_carrier_errors +
stats->tx_aborted_errors +
stats->tx_window_errors +
stats->tx_heartbeat_errors,
stats->tx_compressed);
}
/**
* dev_get_stats - get network device statistics
* @dev: device to get statistics from
* @storage: place to store stats
*
* Get network statistics from device. Return @storage.
* The device driver may provide its own method by setting
* dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
* otherwise the internal statistics structure is used.
*/
struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
struct rtnl_link_stats64 *storage)
{
const struct net_device_ops *ops = dev->netdev_ops;
if (ops->ndo_get_stats64) {
memset(storage, 0, sizeof(*storage));
ops->ndo_get_stats64(dev, storage);
} else if (ops->ndo_get_stats) {
netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
} else {
netdev_stats_to_stats64(storage, &dev->stats);
}
storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
return storage;
}
/* The main device statistics structure */
struct rtnl_link_stats64 {
__u64 rx_packets; /* total packets received */
__u64 tx_packets; /* total packets transmitted */
__u64 rx_bytes; /* total bytes received */
__u64 tx_bytes; /* total bytes transmitted */
__u64 rx_errors; /* bad packets received */
__u64 tx_errors; /* packet transmit problems */
__u64 rx_dropped; /* no space in linux buffers */
__u64 tx_dropped; /* no space available in linux */
__u64 multicast; /* multicast packets received */
__u64 collisions;
/* detailed rx_errors: */
__u64 rx_length_errors;
__u64 rx_over_errors; /* receiver ring buff overflow */
__u64 rx_crc_errors; /* recved pkt with crc error */
__u64 rx_frame_errors; /* recv'd frame alignment error */
__u64 rx_fifo_errors; /* recv'r fifo overrun */
__u64 rx_missed_errors; /* receiver missed packet */
/* detailed tx_errors */
__u64 tx_aborted_errors;
__u64 tx_carrier_errors;
__u64 tx_fifo_errors;
__u64 tx_heartbeat_errors;
__u64 tx_window_errors;
/* for cslip etc */
__u64 rx_compressed;
__u64 tx_compressed;
};
/*
* enqueue_to_backlog is called to queue an skb to a per CPU backlog
* queue (may be a remote CPU queue).
*/
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
unsigned int *qtail)
{
struct softnet_data *sd;
unsigned long flags;
sd = &per_cpu(softnet_data, cpu);
local_irq_save(flags);
rps_lock(sd);
/* 判断接收队列是否满,队列长度为 netdev_max_backlog */
if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
if (skb_queue_len(&sd->input_pkt_queue)) {
enqueue:
/* 队列如果不会空,将数据包添加到队列尾 */
__skb_queue_tail(&sd->input_pkt_queue, skb);
input_queue_tail_incr_save(sd, qtail);
rps_unlock(sd);
local_irq_restore(flags);
return NET_RX_SUCCESS;
}
/* Schedule NAPI for backlog device
* We can use non atomic operation since we own the queue lock
*/
/* 队列如果为空,回到 ____napi_schedule加入poll_list轮询部分,并重新发起软中断 */
if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
if (!rps_ipi_queued(sd))
____napi_schedule(sd, &sd->backlog);
}
goto enqueue;
}
/* 队列满则直接丢弃,对应计数器 +1 */
sd->dropped++;
rps_unlock(sd);
local_irq_restore(flags);
atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
return NET_RX_DROP;
}
/**
* 文件:ixgbe_main.c
* ixgbe_request_irq - initialize interrupts
* @adapter: board private structure
*
* Attempts to configure interrupts using the best available
* capabilities of the hardware and kernel.
**/
static int ixgbe_request_irq(struct ixgbe_adapter *adapter)
{
struct net_device *netdev = adapter->netdev;
int err;
/* 支持MSIX,调用 ixgbe_request_msix_irqs 设置中断处理程序*/
if (adapter->flags & IXGBE_FLAG_MSIX_ENABLED)
err = ixgbe_request_msix_irqs(adapter);
/* 支持MSI,直接设置 ixgbe_intr 为中断处理程序 */
else if (adapter->flags & IXGBE_FLAG_MSI_ENABLED)
err = request_irq(adapter->pdev->irq, &ixgbe_intr, 0,
netdev->name, adapter);
/* 都不支持的情况,直接设置 ixgbe_intr 为中断处理程序 */
else
err = request_irq(adapter->pdev->irq, &ixgbe_intr, IRQF_SHARED,
netdev->name, adapter);
if (err)
e_err(probe, "request_irq failed, Error %d\n", err);
return err;
}
/**
* 文件:ixgbe_main.c
* ixgbe_request_msix_irqs - Initialize MSI-X interrupts
* @adapter: board private structure
*
* ixgbe_request_msix_irqs allocates MSI-X vectors and requests
* interrupts from the kernel.
**/
static int (struct ixgbe_adapter *adapter)
{
…
for (vector = 0; vector < adapter->num_q_vectors; vector++) {
struct ixgbe_q_vector *q_vector = adapter->q_vector[vector];
struct msix_entry *entry = &adapter->msix_entries[vector];
/* 设置中断处理入口函数为 ixgbe_msix_clean_rings */
err = request_irq(entry->vector, &ixgbe_msix_clean_rings, 0,
q_vector->name, q_vector);
if (err) {
e_err(probe, "request_irq failed for MSIX interrupt '%s' "
"Error: %d\n", q_vector->name, err);
goto free_queue_irqs;
}
…
}
}
/**
* 文件:ixgbe_main.c
**/
static irqreturn_t ixgbe_msix_clean_rings(int irq, void *data)
{
struct ixgbe_q_vector *q_vector = data;
/* EIAM disabled interrupts (on this vector) for us */
if (q_vector->rx.ring || q_vector->tx.ring)
napi_schedule(&q_vector->napi);
return IRQ_HANDLED;
}
/**
* 文件:include/linux/netdevice.h
* napi_schedule - schedule NAPI poll
* @n: NAPI context
*
* Schedule NAPI poll routine to be called if it is not already
* running.
*/
static inline void napi_schedule(struct napi_struct *n)
{
if (napi_schedule_prep(n))
/* 注意下面调用的这个函数名字前是两个下划线 */
__napi_schedule(n);
}
/**
* 文件:net/core/dev.c
* __napi_schedule - schedule for receive
* @n: entry to schedule
*
* The entry's receive function will be scheduled to run.
* Consider using __napi_schedule_irqoff() if hard irqs are masked.
*/
void __napi_schedule(struct napi_struct *n)
{
unsigned long flags;
/* local_irq_save用来保存中断状态,并禁止中断 */
local_irq_save(flags);
/* 注意下面调用的这个函数名字前是四个下划线,传入的 softnet_data 是当前CPU */
____napi_schedule(this_cpu_ptr(&softnet_data), n);
local_irq_restore(flags);
}
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
/* 将 napi_struct 加入 softnet_data 的 poll_list */
list_add_tail(&napi->poll_list, &sd->poll_list);
/* 发起软中断 NET_RX_SOFTIRQ */
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
/*
* 文件:net/core/dev.c
* Initialize the DEV module. At boot time this walks the device list and
* unhooks any devices that fail to initialise (normally hardware not
* present) and leaves us with a valid list of present and active devices.
*
*/
/*
* This is called single threaded during boot, so no need
* to take the rtnl semaphore.
*/
static int __init net_dev_init(void)
{
…
/* 分别注册TX和RX软中断的处理程序 */
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
…
}
/* 文件:net/core/dev.c */
static void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = &__get_cpu_var(softnet_data);
unsigned long time_limit = jiffies + 2;
int budget = netdev_budget;
void *have;
local_irq_disable();
while (!list_empty(&sd->poll_list)) {
struct napi_struct *n;
int work, weight;
/* If softirq window is exhuasted then punt.
* Allow this to run for 2 jiffies since which will allow
* an average latency of 1.5/HZ.
*/
/* 判断处理包数是否超过 netdev_budget 及时间是否超过2个时间片 */
if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
goto softnet_break;
local_irq_enable();
/* Even though interrupts have been re-enabled, this
* access is safe because interrupts can only add new
* entries to the tail of this list, and only ->poll()
* calls can remove this head entry from the list.
*/
n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
have = netpoll_poll_lock(n);
weight = n->weight;
/* This NAPI_STATE_SCHED test is for avoiding a race
* with netpoll's poll_napi(). Only the entity which
* obtains the lock and sees NAPI_STATE_SCHED set will
* actually make the ->poll() call. Therefore we avoid
* accidentally calling ->poll() when NAPI is not scheduled.
*/
work = 0;
if (test_bit(NAPI_STATE_SCHED, &n->state)) {
work = n->poll(n, weight);
trace_napi_poll(n);
}
……
}
}
/**
* netif_receive_skb - process receive buffer from network
* @skb: buffer to process
*
* netif_receive_skb() is the main receive data processing function.
* It always succeeds. The buffer may be dropped during processing
* for congestion control or by the protocol layers.
*
* This function may only be called from softirq context and interrupts
* should be enabled.
*
* Return values (usually ignored):
* NET_RX_SUCCESS: no congestion
* NET_RX_DROP: packet was dropped
*/
int netif_receive_skb(struct sk_buff *skb)
{
int ret;
net_timestamp_check(netdev_tstamp_prequeue, skb);
if (skb_defer_rx_timestamp(skb))
return NET_RX_SUCCESS;
rcu_read_lock();
#ifdef CONFIG_RPS
/* 判断是否启用RPS机制 */
if (static_key_false(&rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
/* 获取对应的CPU Core */
int cpu = get_rps_cpu(skb->dev, skb, &rflow);
if (cpu >= 0) {
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
rcu_read_unlock();
return ret;
}
}
#endif
ret = __netif_receive_skb(skb);
rcu_read_unlock();
return ret;
}
# lspci -vvv | grep Eth
01:00.0 Ethernet controller: Intel Corporation Ethernet Controller 10-Gigabit X540-AT2 (rev 03)
Subsystem: Dell Ethernet 10G 4P X540/I350 rNDC
01:00.1 Ethernet controller: Intel Corporation Ethernet Controller 10-Gigabit X540-AT2 (rev 03)
Subsystem: Dell Ethernet 10G 4P X540/I350 rNDC
# lspci -vvv
07:00.0 Ethernet controller: Intel Corporation I350 Gigabit Network Connection (rev 01)
Subsystem: Dell Gigabit 4P X540/I350 rNDC
Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR- FastB2B- DisINTx+
Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx-
Latency: 0, Cache Line Size: 128 bytes
Interrupt: pin D routed to IRQ 19
Region 0: Memory at 92380000 (32-bit, non-prefetchable) [size=512K]
Region 3: Memory at 92404000 (32-bit, non-prefetchable) [size=16K]
Expansion ROM at 92a00000 [disabled] [size=512K]
Capabilities: [40] Power Management version 3
Flags: PMEClk- DSI+ D1- D2- AuxCurrent=0mA PME(D0+,D1-,D2-,D3hot+,D3cold+)
Status: D0 NoSoftRst+ PME-Enable- DSel=0 DScale=1 PME-
Capabilities: [50] MSI: Enable- Count=1/1 Maskable+ 64bit+
Address: 0000000000000000 Data: 0000
Masking: 00000000 Pending: 00000000
Capabilities: [70] MSI-X: Enable+ Count=10 Masked-
Vector table: BAR=3 offset=00000000
PBA: BAR=3 offset=00002000
...
#define IXGBE_MAX_MSIX_VECTORS_82599 0x40
...
u16 ixgbe_get_pcie_msix_count_generic(struct ixgbe_hw *hw)
{
u16 msix_count;
u16 max_msix_count;
u16 pcie_offset;
switch (hw->mac.type) {
case ixgbe_mac_82598EB:
pcie_offset = IXGBE_PCIE_MSIX_82598_CAPS;
max_msix_count = IXGBE_MAX_MSIX_VECTORS_82598;
break;
case ixgbe_mac_82599EB:
case ixgbe_mac_X540:
case ixgbe_mac_X550:
case ixgbe_mac_X550EM_x:
case ixgbe_mac_x550em_a:
pcie_offset = IXGBE_PCIE_MSIX_82599_CAPS;
max_msix_count = IXGBE_MAX_MSIX_VECTORS_82599;
break;
default:
return 1;
}
...
static int ixgbe_acquire_msix_vectors(struct ixgbe_adapter *adapter)
{
struct ixgbe_hw *hw = &adapter->hw;
int i, vectors, vector_threshold;
/* We start by asking for one vector per queue pair with XDP queues
* being stacked with TX queues.
*/
vectors = max(adapter->num_rx_queues, adapter->num_tx_queues);
vectors = max(vectors, adapter->num_xdp_queues);
/* It is easy to be greedy for MSI-X vectors. However, it really
* doesn't do much good if we have a lot more vectors than CPUs. We'll
* be somewhat conservative and only ask for (roughly) the same number
* of vectors as there are CPUs.
*/
vectors = min_t(int, vectors, num_online_cpus());
global hard, soft, wq
probe irq_handler.entry {
hard[irq, dev_name]++;
}
probe timer.s(1) {
println("==irq number:dev_name")
foreach( [irq, dev_name] in hard- limit 5) {
printf("%d,%s->%d\n", irq, kernel_string(dev_name), hard[irq, dev_name]);
}
println("==softirq cpu:h:vec:action")
foreach( [c,h,vec,action] in soft- limit 5) {
printf("%d:%x:%x:%s->%d\n", c, h, vec, symdata(action), soft[c,h,vec,action]);
}
println("==workqueue wq_thread:work_func")
foreach( [wq_thread,work_func] in wq- limit 5) {
printf("%x:%x->%d\n", wq_thread, work_func, wq[wq_thread, work_func]);
}
println("\n")
delete hard
delete soft
delete wq
}
probe softirq.entry {
soft[cpu(), h,vec,action]++;
}
probe workqueue.execute {
wq[wq_thread, work_func]++
}
probe begin {
println("~")
}
==irq number:dev_name
87,eth0-0->1693
90,eth0-3->1263
95,eth1-3->746
92,eth1-0->703
89,eth0-2->654
==softirq cpu:h:vec:action
0:ffffffff81a83098:ffffffff81a83080:0xffffffff81461a00->8928
0:ffffffff81a83088:ffffffff81a83080:0xffffffff81084940->626
0:ffffffff81a830c8:ffffffff81a83080:0xffffffff810ecd70->614
16:ffffffff81a83088:ffffffff81a83080:0xffffffff81084940->225
16:ffffffff81a830c8:ffffffff81a83080:0xffffffff810ecd70->224
==workqueue wq_thread:work_func
ffff88083062aae0:ffffffffa01c53d0->10
ffff88083062aae0:ffffffffa01ca8f0->10
ffff88083420a080:ffffffff81142160->2
ffff8808343fe040:ffffffff8127c9d0->2
ffff880834282ae0:ffffffff8133bd20->1
addr2line -e /usr/lib/debug/lib/modules/2.6.32-431.20.3.el6.mt20161028.x86_64/vmlinux ffffffff81461a00
/usr/src/debug/kernel-2.6.32-431.20.3.el6/linux-2.6.32-431.20.3.el6.mt20161028.x86_64/net/core/dev.c:4013
欢迎光临 168大数据 (http://www.bi168.cn/) | Powered by Discuz! X3.2 |