From 733a0e51357e340d41a6af24bc5169b00f6e2ebd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=93=AD=E8=BD=A9?= <11397959+zhangmingxuan777@user.noreply.gitee.com> Date: Fri, 7 Mar 2025 16:29:26 +0800 Subject: [PATCH] send packet and local-net --- linux-6.2/arch/x86/include/asm/current.h | 2 + linux-6.2/arch/x86/include/asm/hardirq.h | 1 + .../drivers/net/ethernet/intel/igb/igb_main.c | 25 +++++-- linux-6.2/drivers/net/loopback.c | 3 + linux-6.2/include/linux/init.h | 1 + linux-6.2/include/linux/interrupt.h | 1 + linux-6.2/include/linux/netdevice.h | 7 +- linux-6.2/include/linux/skbuff.h | 2 + linux-6.2/include/linux/socket.h | 9 ++- linux-6.2/include/linux/uaccess.h | 6 +- linux-6.2/include/linux/uio.h | 2 + linux-6.2/include/linux/wait.h | 1 + linux-6.2/include/net/dst.h | 1 + linux-6.2/include/net/ip_fib.h | 2 + linux-6.2/include/net/neighbour.h | 4 ++ linux-6.2/include/net/route.h | 6 ++ linux-6.2/include/net/sock.h | 7 ++ linux-6.2/include/net/tcp.h | 2 +- linux-6.2/init/main.c | 2 +- linux-6.2/kernel/sched/wait.c | 1 + linux-6.2/lib/iov_iter.c | 6 +- linux-6.2/net/core/dev.c | 42 ++++++++++-- linux-6.2/net/core/neighbour.c | 17 +++-- linux-6.2/net/core/sock.c | 15 ++++- linux-6.2/net/core/stream.c | 8 +++ linux-6.2/net/ipv4/af_inet.c | 16 +++-- linux-6.2/net/ipv4/arp.c | 1 + linux-6.2/net/ipv4/inet_connection_sock.c | 23 ++++++- linux-6.2/net/ipv4/ip_output.c | 49 +++++++++++--- linux-6.2/net/ipv4/route.c | 2 + linux-6.2/net/ipv4/tcp.c | 67 ++++++++++++++++--- linux-6.2/net/ipv4/tcp_input.c | 2 + linux-6.2/net/ipv4/tcp_output.c | 46 +++++++++---- linux-6.2/net/ipv4/tcp_rate.c | 2 + linux-6.2/net/sched/sch_generic.c | 10 ++- linux-6.2/net/socket.c | 26 +++++-- linux-6.2/security/security.c | 1 + 37 files changed, 352 insertions(+), 66 deletions(-) diff --git a/linux-6.2/arch/x86/include/asm/current.h b/linux-6.2/arch/x86/include/asm/current.h index a1168e7b6..b043c3cbf 100644 --- a/linux-6.2/arch/x86/include/asm/current.h +++ 
b/linux-6.2/arch/x86/include/asm/current.h @@ -11,6 +11,7 @@ struct task_struct; +//存储该 CPU 核心的一些运行时信息 struct pcpu_hot { union { struct { @@ -22,6 +23,7 @@ struct pcpu_hot { #endif unsigned long top_of_stack; void *hardirq_stack_ptr; + //存储软中断状态 u16 softirq_pending; #ifdef CONFIG_X86_64 bool hardirq_stack_inuse; diff --git a/linux-6.2/arch/x86/include/asm/hardirq.h b/linux-6.2/arch/x86/include/asm/hardirq.h index 66837b8c6..f5f1f0557 100644 --- a/linux-6.2/arch/x86/include/asm/hardirq.h +++ b/linux-6.2/arch/x86/include/asm/hardirq.h @@ -60,6 +60,7 @@ extern u64 arch_irq_stat_cpu(unsigned int cpu); extern u64 arch_irq_stat(void); #define arch_irq_stat arch_irq_stat +//x86 用于存储当前CPU的软中断(softirq)状态。 #define local_softirq_pending_ref pcpu_hot.softirq_pending #if IS_ENABLED(CONFIG_KVM_INTEL) diff --git a/linux-6.2/drivers/net/ethernet/intel/igb/igb_main.c b/linux-6.2/drivers/net/ethernet/intel/igb/igb_main.c index 394bec227..c79e26548 100644 --- a/linux-6.2/drivers/net/ethernet/intel/igb/igb_main.c +++ b/linux-6.2/drivers/net/ethernet/intel/igb/igb_main.c @@ -1226,8 +1226,8 @@ static int igb_alloc_q_vector(struct igb_adapter *adapter, memset(q_vector, 0, size); } if (!q_vector) - return -ENOMEM; - + return -ENOMEM; + /* initialize NAPI */ //使用 netif_napi_add 注册 NAPI,并指定 igb_poll 作为网络中断的处理函数。NAPI 是一种机制,用于减少中断频率并提高网络吞吐量。 netif_napi_add(adapter->netdev, &q_vector->napi, igb_poll); @@ -2165,6 +2165,7 @@ int igb_up(struct igb_adapter *adapter) wr32(E1000_CTRL_EXT, reg_data); } + //启用网卡 netif_tx_start_all_queues(adapter->netdev); /* start the watchdog. 
*/ @@ -4258,6 +4259,7 @@ int igb_setup_tx_resources(struct igb_ring *tx_ring) struct device *dev = tx_ring->dev; int size; + //申请igb_tx_buffer数组 size = sizeof(struct igb_tx_buffer) * tx_ring->count; tx_ring->tx_buffer_info = vmalloc(size); @@ -6177,6 +6179,7 @@ static inline int igb_maybe_stop_tx(struct igb_ring *tx_ring, const u16 size) return __igb_maybe_stop_tx(tx_ring, size); } +//将数据放入缓冲区 static int igb_tx_map(struct igb_ring *tx_ring, struct igb_tx_buffer *first, const u8 hdr_len) @@ -6191,17 +6194,22 @@ static int igb_tx_map(struct igb_ring *tx_ring, u32 cmd_type = igb_tx_cmd_type(skb, tx_flags); u16 i = tx_ring->next_to_use; + //获取发送描述符 tx_desc = IGB_TX_DESC(tx_ring, i); igb_tx_olinfo_status(tx_ring, tx_desc, tx_flags, skb->len - hdr_len); + //计算头部和数据长度 size = skb_headlen(skb); data_len = skb->data_len; + //将数据映射到 DMA 地址 dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE); tx_buffer = first; + + //负责处理包含多个片段的 TCP/UDP 数据包 遍历数据包 skb 的所有片段 for (frag = &skb_shinfo(skb)->frags[0];; frag++) { if (dma_mapping_error(tx_ring->dev, dma)) goto dma_error; @@ -6245,7 +6253,7 @@ static int igb_tx_map(struct igb_ring *tx_ring, size = skb_frag_size(frag); data_len -= size; - + //将当前片段映射到设备的 DMA 地址 dma = skb_frag_dma_map(tx_ring->dev, frag, 0, size, DMA_TO_DEVICE); @@ -6253,6 +6261,7 @@ static int igb_tx_map(struct igb_ring *tx_ring, } /* write last descriptor with RS and EOP bits */ + //设置最后一个描述符 cmd_type |= size | IGB_TXD_DCMD; tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type); @@ -6430,6 +6439,7 @@ int igb_xmit_xdp_ring(struct igb_adapter *adapter, return IGB_XDP_CONSUMED; } +//igb网卡发送数据 netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb, struct igb_ring *tx_ring) { @@ -6457,12 +6467,14 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb, } /* record the location of the first descriptor for this packet */ + //记录第一个描述符的位置 first = &tx_ring->tx_buffer_info[tx_ring->next_to_use]; first->type = IGB_TYPE_SKB; first->skb = skb; first->bytecount = skb->len; 
first->gso_segs = 1; + //时间戳 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) { struct igb_adapter *adapter = netdev_priv(tx_ring->netdev); @@ -6481,6 +6493,7 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb, } } + //vlan if (skb_vlan_tag_present(skb)) { tx_flags |= IGB_TX_FLAGS_VLAN; tx_flags |= (skb_vlan_tag_get(skb) << IGB_TX_FLAGS_VLAN_SHIFT); @@ -6496,6 +6509,7 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb, else if (!tso) igb_tx_csum(tx_ring, first); + //调用 igb_tx_map 函数将数据包映射到硬件发送环的描述符 if (igb_tx_map(tx_ring, first, hdr_len)) goto cleanup_tx_tstamp; @@ -6529,6 +6543,7 @@ static inline struct igb_ring *igb_tx_queue_mapping(struct igb_adapter *adapter, return adapter->tx_ring[r_idx]; } +//igb网卡发送 static netdev_tx_t igb_xmit_frame(struct sk_buff *skb, struct net_device *netdev) { @@ -8884,6 +8899,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) #if (PAGE_SIZE < 8192) frame_sz = igb_rx_frame_truesize(rx_ring, 0); #endif + //初始化和准备 XDP 缓冲区 xdp_init_buff(&xdp, frame_sz, &rx_ring->xdp_rxq); //在一个循环中,处理接收队列中的数据包。每次迭代最多处理 budget 个数据包,直到达到 budget @@ -8918,6 +8934,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) //获取缓冲区的地址 pktbuf = page_address(rx_buffer->page) + rx_buffer->page_offset; + //处理时间戳 /* pull rx packet timestamp if available and valid */ if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) { int ts_hdr_len; @@ -8970,7 +8987,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) rx_buffer->pagecnt_bias++; break; } - + //释放接收缓冲区 igb_put_rx_buffer(rx_ring, rx_buffer, rx_buf_pgcnt); cleaned_count++; diff --git a/linux-6.2/drivers/net/loopback.c b/linux-6.2/drivers/net/loopback.c index f6d53e63e..a6d788706 100644 --- a/linux-6.2/drivers/net/loopback.c +++ b/linux-6.2/drivers/net/loopback.c @@ -66,6 +66,7 @@ EXPORT_SYMBOL(blackhole_netdev); /* The higher levels take care of making this non-reentrant (it's * called with bh's disabled). 
*/ +//本机网络io的发送 static netdev_tx_t loopback_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -76,6 +77,7 @@ static netdev_tx_t loopback_xmit(struct sk_buff *skb, /* do not fool net_timestamp_check() with various clock bases */ skb_clear_tstamp(skb); + //剥离和源socket的联系 skb_orphan(skb); /* Before queueing this packet to __netif_rx(), @@ -86,6 +88,7 @@ static netdev_tx_t loopback_xmit(struct sk_buff *skb, skb->protocol = eth_type_trans(skb, dev); len = skb->len; + //调用__netif_rx if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) dev_lstats_add(dev, len); diff --git a/linux-6.2/include/linux/init.h b/linux-6.2/include/linux/init.h index c5fe6d26f..016d0ad0b 100644 --- a/linux-6.2/include/linux/init.h +++ b/linux-6.2/include/linux/init.h @@ -269,6 +269,7 @@ extern bool initcall_debug; * * Only for built-in code, not modules. */ +//在SMP初始化前执行 #define early_initcall(fn) __define_initcall(fn, early) /* diff --git a/linux-6.2/include/linux/interrupt.h b/linux-6.2/include/linux/interrupt.h index 166928863..1463d28d6 100644 --- a/linux-6.2/include/linux/interrupt.h +++ b/linux-6.2/include/linux/interrupt.h @@ -520,6 +520,7 @@ DECLARE_STATIC_KEY_FALSE(force_irqthreads_key); #ifndef local_softirq_pending +//相应架构没有定义时 #ifndef local_softirq_pending_ref //local_softirq_pending_ref实际上是irq_stat结构中的__softirq_pending,__softirq_pending字段中的每一个bit,对应着某一个软中断,某个bit被置位,说明有相应的软中断等待处理。 #define local_softirq_pending_ref irq_stat.__softirq_pending diff --git a/linux-6.2/include/linux/netdevice.h b/linux-6.2/include/linux/netdevice.h index b2ed3c620..a4289298c 100644 --- a/linux-6.2/include/linux/netdevice.h +++ b/linux-6.2/include/linux/netdevice.h @@ -3037,6 +3037,7 @@ u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev); int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id); +//设备子系统入口函数 static inline int dev_queue_xmit(struct sk_buff *skb) { return __dev_queue_xmit(skb, NULL); @@ -4867,7 +4868,7 @@ static 
inline ktime_t netdev_get_tstamp(struct net_device *dev, return hwtstamps->hwtstamp; } - +//网络设备子系统 发送 static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, struct sk_buff *skb, struct net_device *dev, bool more) @@ -4881,13 +4882,15 @@ static inline bool netdev_xmit_more(void) return __this_cpu_read(softnet_data.xmit.more); } +//发送数据包 static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, bool more) { const struct net_device_ops *ops = dev->netdev_ops; netdev_tx_t rc; - + //调用 rc = __netdev_start_xmit(ops, skb, dev, more); + //更新时间戳 if (rc == NETDEV_TX_OK) txq_trans_update(txq); diff --git a/linux-6.2/include/linux/skbuff.h b/linux-6.2/include/linux/skbuff.h index 99d1ce072..30ec75658 100644 --- a/linux-6.2/include/linux/skbuff.h +++ b/linux-6.2/include/linux/skbuff.h @@ -2095,10 +2095,12 @@ static inline struct sk_buff *skb_peek_next(struct sk_buff *skb, * The reference count is not incremented and the reference is therefore * volatile. Use with caution. */ +//获取sk_buff_head队列的尾部元素 static inline struct sk_buff *skb_peek_tail(const struct sk_buff_head *list_) { struct sk_buff *skb = READ_ONCE(list_->prev); + //skb为自己 即队列为空 if (skb == (struct sk_buff *)list_) skb = NULL; return skb; diff --git a/linux-6.2/include/linux/socket.h b/linux-6.2/include/linux/socket.h index 13c3a237b..9eeb9f381 100644 --- a/linux-6.2/include/linux/socket.h +++ b/linux-6.2/include/linux/socket.h @@ -30,9 +30,11 @@ typedef __kernel_sa_family_t sa_family_t; /* * 1003.1g requires sa_family_t and that sa_data is char. */ - +//对端地址 struct sockaddr { + //地址族 sa_family_t sa_family; /* address family, AF_xxx */ + //地址数据 union { char sa_data_min[14]; /* Minimum 14 bytes of protocol address */ DECLARE_FLEX_ARRAY(char, sa_data); @@ -51,13 +53,14 @@ struct linger { * system, not 4.3. Thus msg_accrights(len) are now missing. They * belong in an obscure libc emulation or the bin. 
*/ - +//消息结构体 struct msghdr { void *msg_name; /* ptr to socket address structure */ int msg_namelen; /* size of socket address structure */ int msg_inq; /* output, data left in socket */ + //数据信息 struct iov_iter msg_iter; /* data */ /* @@ -166,7 +169,7 @@ static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr { return __cmsg_nxthdr(__msg->msg_control, __msg->msg_controllen, __cmsg); } - +//获取数据长度 static inline size_t msg_data_left(struct msghdr *msg) { return iov_iter_count(&msg->msg_iter); diff --git a/linux-6.2/include/linux/uaccess.h b/linux-6.2/include/linux/uaccess.h index afb18f198..2811e0259 100644 --- a/linux-6.2/include/linux/uaccess.h +++ b/linux-6.2/include/linux/uaccess.h @@ -116,6 +116,7 @@ __copy_to_user(void __user *to, const void *from, unsigned long n) return raw_copy_to_user(to, from, n); } +// #ifdef INLINE_COPY_FROM_USER static inline __must_check unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n) @@ -124,6 +125,7 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) might_fault(); if (!should_fail_usercopy() && likely(access_ok(from, n))) { instrument_copy_from_user_before(to, from, n); + //实际操作 res = raw_copy_from_user(to, from, n); instrument_copy_from_user_after(to, from, n, res); } @@ -154,9 +156,11 @@ extern __must_check unsigned long _copy_to_user(void __user *, const void *, unsigned long); #endif +//从用户空间拷贝 static __always_inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) -{ +{ + //检查to的空间 if (check_copy_size(to, n, false)) n = _copy_from_user(to, from, n); return n; diff --git a/linux-6.2/include/linux/uio.h b/linux-6.2/include/linux/uio.h index 9f158238e..61d3c6e34 100644 --- a/linux-6.2/include/linux/uio.h +++ b/linux-6.2/include/linux/uio.h @@ -38,6 +38,7 @@ struct iov_iter_state { unsigned long nr_segs; }; +//数据信息 struct iov_iter { u8 iter_type; bool nofault; @@ -49,6 +50,7 @@ struct iov_iter { }; size_t 
count; union { + //实际数据 const struct iovec *iov; const struct kvec *kvec; const struct bio_vec *bvec; diff --git a/linux-6.2/include/linux/wait.h b/linux-6.2/include/linux/wait.h index 100d02159..5356ffe11 100644 --- a/linux-6.2/include/linux/wait.h +++ b/linux-6.2/include/linux/wait.h @@ -168,6 +168,7 @@ extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wai extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); +//将wq_entry放入wq_head static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { struct list_head *head = &wq_head->head; diff --git a/linux-6.2/include/net/dst.h b/linux-6.2/include/net/dst.h index 660ee9ac2..12d6e0ba7 100644 --- a/linux-6.2/include/net/dst.h +++ b/linux-6.2/include/net/dst.h @@ -439,6 +439,7 @@ INDIRECT_CALLABLE_DECLARE(int ip6_output(struct net *, struct sock *, INDIRECT_CALLABLE_DECLARE(int ip_output(struct net *, struct sock *, struct sk_buff *)); /* Output packet to network from transport. 
*/ +//出口函数 回调ip_output static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb) { return INDIRECT_CALL_INET(skb_dst(skb)->output, diff --git a/linux-6.2/include/net/ip_fib.h b/linux-6.2/include/net/ip_fib.h index a378eff82..dba5c6cfd 100644 --- a/linux-6.2/include/net/ip_fib.h +++ b/linux-6.2/include/net/ip_fib.h @@ -306,6 +306,7 @@ static inline struct fib_table *fib_new_table(struct net *net, u32 id) return fib_get_table(net, id); } +//查找路由核心函数 static inline int fib_lookup(struct net *net, const struct flowi4 *flp, struct fib_result *res, unsigned int flags) { @@ -314,6 +315,7 @@ static inline int fib_lookup(struct net *net, const struct flowi4 *flp, rcu_read_lock(); + //查询main路由表 tb = fib_get_table(net, RT_TABLE_MAIN); if (tb) err = fib_table_lookup(tb, flp, res, flags | FIB_LOOKUP_NOREF); diff --git a/linux-6.2/include/net/neighbour.h b/linux-6.2/include/net/neighbour.h index 2f2a6023f..f4fe3177a 100644 --- a/linux-6.2/include/net/neighbour.h +++ b/linux-6.2/include/net/neighbour.h @@ -496,6 +496,7 @@ static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb unsigned int seq; unsigned int hh_len; + //拷贝二层头到skb do { seq = read_seqbegin(&hh->hh_lock); hh_len = READ_ONCE(hh->hh_len); @@ -530,6 +531,7 @@ static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb return dev_queue_xmit(skb); } +//邻居子系统进入 static inline int neigh_output(struct neighbour *n, struct sk_buff *skb, bool skip_cache) { @@ -538,11 +540,13 @@ static inline int neigh_output(struct neighbour *n, struct sk_buff *skb, /* n->nud_state and hh->hh_len could be changed under us. * neigh_hh_output() is taking care of the race later. 
*/ + //有连接 且缓存头部存在 if (!skip_cache && (READ_ONCE(n->nud_state) & NUD_CONNECTED) && READ_ONCE(hh->hh_len)) return neigh_hh_output(hh, skb); + //arp_constructor时定义为dev_queue_xmit return n->output(n, skb); } diff --git a/linux-6.2/include/net/route.h b/linux-6.2/include/net/route.h index 6e92dd5bc..7bd9e6cf3 100644 --- a/linux-6.2/include/net/route.h +++ b/linux-6.2/include/net/route.h @@ -168,6 +168,7 @@ static inline struct rtable *ip_route_output(struct net *net, __be32 daddr, return ip_route_output_key(net, &fl4); } +//查找路由表 static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi4 *fl4, struct sock *sk, __be32 daddr, __be32 saddr, @@ -372,18 +373,22 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst) return hoplimit; } +//查找邻居项 static inline struct neighbour *ip_neigh_gw4(struct net_device *dev, __be32 daddr) { struct neighbour *neigh; + //在邻居表中查询邻居 neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)daddr); if (unlikely(!neigh)) + //创建新的邻居表项 neigh = __neigh_create(&arp_tbl, &daddr, dev, false); return neigh; } +//查找下一跳 static inline struct neighbour *ip_neigh_for_gw(struct rtable *rt, struct sk_buff *skb, bool *is_v6gw) @@ -392,6 +397,7 @@ static inline struct neighbour *ip_neigh_for_gw(struct rtable *rt, struct neighbour *neigh; if (likely(rt->rt_gw_family == AF_INET)) { + //ipv4 neigh = ip_neigh_gw4(dev, rt->rt_gw4); } else if (rt->rt_gw_family == AF_INET6) { neigh = ip_neigh_gw6(dev, &rt->rt_gw6); diff --git a/linux-6.2/include/net/sock.h b/linux-6.2/include/net/sock.h index 7019e6591..5055bcd66 100644 --- a/linux-6.2/include/net/sock.h +++ b/linux-6.2/include/net/sock.h @@ -176,7 +176,9 @@ struct sock_common { union { __portpair skc_portpair; struct { + //目的端口 __be16 skc_dport; + //本地端口 __u16 skc_num; }; }; @@ -452,6 +454,7 @@ struct sock { struct sk_buff *sk_send_head; struct rb_root tcp_rtx_queue; }; + //发送缓冲队列 struct sk_buff_head sk_write_queue; __s32 sk_peek_off; int sk_write_pending; @@ -1175,6 +1178,7 @@ static 
inline void sock_rps_reset_rxhash(struct sock *sk) #endif } +//wait_woken 进入睡眠 #define sk_wait_event(__sk, __timeo, __condition, __wait) \ ({ int __rc; \ release_sock(__sk); \ @@ -1327,6 +1331,7 @@ struct proto { u32 sysctl_rmem_offset; int max_header; + //不用自动绑定 bool no_autobind; struct kmem_cache *slab; @@ -2529,6 +2534,7 @@ static inline void sk_set_bit(int nr, struct sock *sk) set_bit(nr, &sk->sk_wq_raw->flags); } +//清除相应标志 static inline void sk_clear_bit(int nr, struct sock *sk) { if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) && @@ -2619,6 +2625,7 @@ static inline long sock_rcvtimeo(const struct sock *sk, bool noblock) return noblock ? 0 : sk->sk_rcvtimeo; } +//检查模式,并返回socket发送的超时时间 static inline long sock_sndtimeo(const struct sock *sk, bool noblock) { return noblock ? 0 : sk->sk_sndtimeo; diff --git a/linux-6.2/include/net/tcp.h b/linux-6.2/include/net/tcp.h index db9f828e9..ae18c13c8 100644 --- a/linux-6.2/include/net/tcp.h +++ b/linux-6.2/include/net/tcp.h @@ -1856,7 +1856,7 @@ static inline struct sk_buff *tcp_rtx_queue_tail(const struct sock *sk) { return skb_rb_last(&sk->tcp_rtx_queue); } - +//获取发送队列的最后一个skb static inline struct sk_buff *tcp_write_queue_tail(const struct sock *sk) { return skb_peek_tail(&sk->sk_write_queue); diff --git a/linux-6.2/init/main.c b/linux-6.2/init/main.c index e1c3911d7..8a7961e02 100644 --- a/linux-6.2/init/main.c +++ b/linux-6.2/init/main.c @@ -1413,7 +1413,7 @@ static void __init do_basic_setup(void) do_ctors(); do_initcalls(); } - +//early_initcall 调用位置 static void __init do_pre_smp_initcalls(void) { initcall_entry_t *fn; diff --git a/linux-6.2/kernel/sched/wait.c b/linux-6.2/kernel/sched/wait.c index 806e1b2bb..136671c97 100644 --- a/linux-6.2/kernel/sched/wait.c +++ b/linux-6.2/kernel/sched/wait.c @@ -21,6 +21,7 @@ void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&wq_head->lock, flags); + //进一步执行 
__add_wait_queue(wq_head, wq_entry); spin_unlock_irqrestore(&wq_head->lock, flags); } diff --git a/linux-6.2/lib/iov_iter.c b/linux-6.2/lib/iov_iter.c index f9a3ff37e..b17f35579 100644 --- a/linux-6.2/lib/iov_iter.c +++ b/linux-6.2/lib/iov_iter.c @@ -421,6 +421,7 @@ size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size) } EXPORT_SYMBOL(fault_in_iov_iter_writeable); +//iov初始化 void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, size_t count) @@ -1861,7 +1862,7 @@ ssize_t import_iovec(int type, const struct iovec __user *uvec, in_compat_syscall()); } EXPORT_SYMBOL(import_iovec); - +//初始化iovec结构与iov_iter结构 int import_single_range(int rw, void __user *buf, size_t len, struct iovec *iov, struct iov_iter *i) { @@ -1869,9 +1870,10 @@ int import_single_range(int rw, void __user *buf, size_t len, len = MAX_RW_COUNT; if (unlikely(!access_ok(buf, len))) return -EFAULT; - + //设置iovec的基址和长度 iov->iov_base = buf; iov->iov_len = len; + //初始化iov_iter结构 iov_iter_init(i, rw, iov, 1, len); return 0; } diff --git a/linux-6.2/net/core/dev.c b/linux-6.2/net/core/dev.c index 1e774e401..ffb087b55 100644 --- a/linux-6.2/net/core/dev.c +++ b/linux-6.2/net/core/dev.c @@ -3067,16 +3067,19 @@ int netif_get_num_default_rss_queues(void) } EXPORT_SYMBOL(netif_get_num_default_rss_queues); +//发出软中断 static void __netif_reschedule(struct Qdisc *q) { struct softnet_data *sd; unsigned long flags; local_irq_save(flags); + //将qdisc追加到本CPU softnet_data的output_queue队尾 sd = this_cpu_ptr(&softnet_data); q->next_sched = NULL; *sd->output_queue_tailp = q; sd->output_queue_tailp = &q->next_sched; + //发出软中断 raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } @@ -3578,6 +3581,7 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) } EXPORT_SYMBOL(netif_skb_features); +//发送一个skb static int xmit_one(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, bool more) { @@ -3589,12 +3593,14 @@ static int xmit_one(struct 
sk_buff *skb, struct net_device *dev, len = skb->len; trace_net_dev_start_xmit(skb, dev); + //发送逻辑 rc = netdev_start_xmit(skb, dev, txq, more); trace_net_dev_xmit(skb, rc, dev, len); return rc; } +//发送函数 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, struct netdev_queue *txq, int *ret) { @@ -3603,9 +3609,11 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *de while (skb) { struct sk_buff *next = skb->next; - + //从链表中移除 skb_mark_not_on_list(skb); + //调用 xmit_one 函数发送当前的数据包 skb rc = xmit_one(skb, dev, txq, next != NULL); + //发送完成 if (unlikely(!dev_xmit_complete(rc))) { skb->next = next; goto out; @@ -3790,6 +3798,7 @@ static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q, return rc; } +//将数据包提交到队列进行传输 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq) @@ -3801,13 +3810,16 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_calculate_pkt_len(skb, q); + //队列是否标记为不需要加锁 if (q->flags & TCQ_F_NOLOCK) { + //绕开排队系统 if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) && qdisc_run_begin(q)) { /* Retest nolock_qdisc_is_empty() within the protection * of q->seqlock to protect from racing with requeuing. */ if (unlikely(!nolock_qdisc_is_empty(q))) { + //调用 dev_qdisc_enqueue 函数将数据包排入队列 rc = dev_qdisc_enqueue(skb, q, &to_free, txq); __qdisc_run(q); qdisc_run_end(q); @@ -3818,13 +3830,15 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_bstats_cpu_update(q, skb); if (sch_direct_xmit(skb, q, dev, txq, NULL, true) && !nolock_qdisc_is_empty(q)) + //执行队列的调度操作 __qdisc_run(q); qdisc_run_end(q); return NET_XMIT_SUCCESS; } - + //正常排队 rc = dev_qdisc_enqueue(skb, q, &to_free, txq); + //开始发送 qdisc_run(q); no_lock_out: @@ -4161,6 +4175,7 @@ struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, * * positive qdisc return code - NET_XMIT_DROP etc. 
* * negative errno - other errors */ +//设备子系统内部函数 int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) { struct net_device *dev = skb->dev; @@ -4169,6 +4184,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) int rc = -ENOMEM; bool again = false; + //设置mac头 skb_reset_mac_header(skb); skb_assert_len(skb); @@ -4186,6 +4202,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) #ifdef CONFIG_NET_CLS_ACT skb->tc_at_ingress = 0; #endif +//CONFIG_NET_EGRESS 配置 #ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { if (nf_hook_egress_active()) { @@ -4209,18 +4226,24 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) /* If device/qdisc don't need skb->dst, release it right now while * its hot in this cpu cache. */ + //释放目标地址 if (dev->priv_flags & IFF_XMIT_DST_RELEASE) skb_dst_drop(skb); else skb_dst_force(skb); + //传输队列为空,则调用 netdev_core_pick_tx来选择合适的传输队列 if (!txq) txq = netdev_core_pick_tx(dev, skb, sb_dev); + //传输队列 txq 的队列调度器 q = rcu_dereference_bh(txq->qdisc); + //跟踪函数 trace_net_dev_queue(skb); + //检查 qdisc 是否有 enqueue 操作 没有则是回环/隧道设备 if (q->enqueue) { + //将数据包 skb 添加到队列中 rc = __dev_xmit_skb(skb, q, dev, txq); goto out; } @@ -4237,6 +4260,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) * Check this and shot the lock. It is not prone from deadlocks. 
*Either shot noqueue qdisc, it is even simpler 8) */ + //设备是否处于启用状态 if (dev->flags & IFF_UP) { int cpu = smp_processor_id(); /* ok because BHs are off */ @@ -4255,6 +4279,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) if (!netif_xmit_stopped(txq)) { dev_xmit_recursion_inc(); + //发送函数 skb = dev_hard_start_xmit(skb, dev, txq, &rc); dev_xmit_recursion_dec(); if (dev_xmit_complete(rc)) { @@ -4679,6 +4704,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) { if (qlen) { enqueue: + //将数据放到softnet_data的队列中 __skb_queue_tail(&sd->input_pkt_queue, skb); input_queue_tail_incr_save(sd, qtail); rps_unlock_irq_restore(sd, &flags); @@ -5066,8 +5092,10 @@ int netif_rx(struct sk_buff *skb) } EXPORT_SYMBOL(netif_rx); +//发包软中断函数 static __latent_entropy void net_tx_action(struct softirq_action *h) { + //获取softnet_data结构 struct softnet_data *sd = this_cpu_ptr(&softnet_data); if (sd->completion_queue) { @@ -5097,6 +5125,7 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) } } + //output_queue上有数据 if (sd->output_queue) { struct Qdisc *head; @@ -5107,7 +5136,7 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) local_irq_enable(); rcu_read_lock(); - + //遍历qdisc while (head) { struct Qdisc *q = head; spinlock_t *root_lock = NULL; @@ -5137,6 +5166,7 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) } clear_bit(__QDISC_STATE_SCHED, &q->state); + //发送数据 qdisc_run(q); if (root_lock) spin_unlock(root_lock); @@ -5401,7 +5431,7 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, if (pfmemalloc) goto skip_taps; - //遍历ptype_all链表的每个对象,进行执行 (tcpdump?) 
+ //遍历ptype_all链表的每个对象,进行执行 (tcpdump) list_for_each_entry_rcu(ptype, &ptype_all, list) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); @@ -6030,6 +6060,7 @@ static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) #endif } +//本地网络收包软中断处理函数 static int process_backlog(struct napi_struct *napi, int quota) { struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); @@ -6048,8 +6079,10 @@ static int process_backlog(struct napi_struct *napi, int quota) while (again) { struct sk_buff *skb; + //从process_queue中取出数据包 while ((skb = __skb_dequeue(&sd->process_queue))) { rcu_read_lock(); + //处理 __netif_receive_skb(skb); rcu_read_unlock(); input_queue_head_incr(sd); @@ -6071,6 +6104,7 @@ static int process_backlog(struct napi_struct *napi, int quota) napi->state = 0; again = false; } else { + //将链表中的数据包移动到process_queue中 skb_queue_splice_tail_init(&sd->input_pkt_queue, &sd->process_queue); } diff --git a/linux-6.2/net/core/neighbour.c b/linux-6.2/net/core/neighbour.c index 4edd2176e..755f71236 100644 --- a/linux-6.2/net/core/neighbour.c +++ b/linux-6.2/net/core/neighbour.c @@ -658,16 +658,18 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, } EXPORT_SYMBOL(neigh_lookup_nodev); +//创建邻居表项 static struct neighbour * ___neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, u32 flags, bool exempt_from_gc, bool want_ref) -{ +{ u32 hash_val, key_len = tbl->key_len; struct neighbour *n1, *rc, *n; struct neigh_hash_table *nht; int error; + //申请邻居表项 n = neigh_alloc(tbl, dev, flags, exempt_from_gc); trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc); if (!n) { @@ -675,11 +677,13 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, goto out; } + //复制 IP 地址,绑定网卡 memcpy(n->primary_key, pkey, key_len); n->dev = dev; netdev_hold(dev, &n->dev_tracker, GFP_ATOMIC); /* Protocol specific setup. 
*/ + //constructor调用arp_constructor if (tbl->constructor && (error = tbl->constructor(n)) < 0) { rc = ERR_PTR(error); goto out_neigh_release; @@ -694,6 +698,7 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, } /* Device specific setup. */ + //赋值 if (n->parms->neigh_setup && (error = n->parms->neigh_setup(n)) < 0) { rc = ERR_PTR(error); @@ -736,6 +741,7 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, list_add_tail(&n->managed_list, &n->tbl->managed_list); if (want_ref) neigh_hold(n); + //添加到邻居哈希表中 rcu_assign_pointer(n->next, rcu_dereference_protected(nht->hash_buckets[hash_val], lockdep_is_held(&tbl->lock))); @@ -1559,19 +1565,19 @@ static void neigh_hh_init(struct neighbour *n) } /* Slow and careful. */ - +//邻居层内部函数 int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) { int rc = 0; - + //发送arp请求 if (!neigh_event_send(neigh, skb)) { int err; struct net_device *dev = neigh->dev; unsigned int seq; - + //初始化二层头缓存 if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len)) neigh_hh_init(neigh); - + //添加二层头 do { __skb_pull(skb, skb_network_offset(skb)); seq = read_seqbegin(&neigh->ha_lock); @@ -1580,6 +1586,7 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) } while (read_seqretry(&neigh->ha_lock, seq)); if (err >= 0) + //发送数据包: rc = dev_queue_xmit(skb); else goto out_kfree_skb; diff --git a/linux-6.2/net/core/sock.c b/linux-6.2/net/core/sock.c index 74c46d40f..1497dfbac 100644 --- a/linux-6.2/net/core/sock.c +++ b/linux-6.2/net/core/sock.c @@ -588,10 +588,12 @@ INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, u32)); INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, u32)); +//检查sk中的路由信息 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = __sk_dst_get(sk); + //执行ipv4_dst_check 检查 if (dst && dst->obsolete && INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie) == NULL) { @@ -2757,12 
+2759,14 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, } EXPORT_SYMBOL(sock_alloc_send_pskb); +//发送控制信息 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, struct sockcm_cookie *sockc) { u32 tsflags; - + //判断类型 switch (cmsg->cmsg_type) { + //设置数据包的标记 case SO_MARK: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) @@ -2771,6 +2775,7 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, return -EINVAL; sockc->mark = *(u32 *)CMSG_DATA(cmsg); break; + //设置时间戳标志 case SO_TIMESTAMPING_OLD: if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; @@ -2782,6 +2787,7 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; sockc->tsflags |= tsflags; break; + //设置网络包的预定发送时间 case SCM_TXTIME: if (!sock_flag(sk, SOCK_TXTIME)) return -EINVAL; @@ -2800,17 +2806,20 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, } EXPORT_SYMBOL(__sock_cmsg_send); +//发送控制信息 int sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct sockcm_cookie *sockc) { struct cmsghdr *cmsg; int ret; + //遍历控制信息 for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_SOCKET) continue; + //发送控制消息 ret = __sock_cmsg_send(sk, cmsg, sockc); if (ret) return ret; @@ -2969,10 +2978,10 @@ EXPORT_SYMBOL_GPL(__sk_flush_backlog); //没有接收到足够的数据,进行阻塞 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) { - //绑定回调函数 + //绑定回调函数 woken_wake_function置为wait.func DEFINE_WAIT_FUNC(wait, woken_wake_function); int rc; - //当前进程关联到等待队列上 + //当前进程关联到等待队列上 sk_sleep(sk)获取sock对象的等待队列表头 add_wait_queue(sk_sleep(sk), &wait); //设置进程状态 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); diff --git a/linux-6.2/net/core/stream.c b/linux-6.2/net/core/stream.c index 434446ab1..ff037c0dc 100644 --- a/linux-6.2/net/core/stream.c +++ b/linux-6.2/net/core/stream.c @@ -53,8 +53,10 @@ void sk_stream_write_space(struct sock 
*sk) * * Must be called with the socket locked. */ +//等待连接建立 int sk_stream_wait_connect(struct sock *sk, long *timeo_p) { + //定义等待队列、注册回调函数、关联进程描述符 DEFINE_WAIT_FUNC(wait, woken_wake_function); struct task_struct *tsk = current; int done; @@ -63,6 +65,7 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) int err = sock_error(sk); if (err) return err; + // 如果当前套接字状态不是 SYN_SENT 或 SYN_RECV,说明连接已建立或关闭 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) return -EPIPE; if (!*timeo_p) @@ -70,13 +73,18 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) if (signal_pending(tsk)) return sock_intr_errno(*timeo_p); + //当前进程添加到sk的等待队列 add_wait_queue(sk_sleep(sk), &wait); + //表示套接字有待处理的写操作 sk->sk_write_pending++; + //睡眠 done = sk_wait_event(sk, timeo_p, !sk->sk_err && !((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)), &wait); + // 移除当前进程的等待队列 remove_wait_queue(sk_sleep(sk), &wait); + // 写操作处理完成 sk->sk_write_pending--; } while (!done); return 0; diff --git a/linux-6.2/net/ipv4/af_inet.c b/linux-6.2/net/ipv4/af_inet.c index 424e6d804..5f296d47a 100644 --- a/linux-6.2/net/ipv4/af_inet.c +++ b/linux-6.2/net/ipv4/af_inet.c @@ -169,7 +169,7 @@ EXPORT_SYMBOL(inet_sock_destruct); /* * Automatically bind an unbound socket. 
*/ - +//自动绑定本地端口 static int inet_autobind(struct sock *sk) { struct inet_sock *inet; @@ -177,10 +177,12 @@ static int inet_autobind(struct sock *sk) lock_sock(sk); inet = inet_sk(sk); if (!inet->inet_num) { + //分配一个可用端口 其在tcp_prot中被定义为inet_csk_get_port if (sk->sk_prot->get_port(sk, 0)) { release_sock(sk); return -EAGAIN; } + //绑定本地端口 inet->inet_sport = htons(inet->inet_num); } release_sock(sk); @@ -324,7 +326,7 @@ static int inet_create(struct net *net, struct socket *sock, int protocol, WARN_ON(!answer_prot->slab); err = -ENOMEM; - //分配sk结构,并将tcp->prot赋值给sk->sk_prot上 + //分配sk结构,并将tcp_prot赋值给sk->sk_prot上 sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); if (!sk) goto out; @@ -817,11 +819,16 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, } EXPORT_SYMBOL(inet_getname); +//绑定本地端口 int inet_send_prepare(struct sock *sk) { + //RPS sock_rps_record_flow(sk); /* We may need to bind the socket. */ + //inet_sk(sk)->inet_num是本地端口号,此处标识还没有本地端口 + //!sk->sk_prot->no_autobind 协议支持自动绑定 + //inet_autobind 进行绑定 if (data_race(!inet_sk(sk)->inet_num) && !sk->sk_prot->no_autobind && inet_autobind(sk)) return -EAGAIN; @@ -830,10 +837,11 @@ int inet_send_prepare(struct sock *sk) } EXPORT_SYMBOL_GPL(inet_send_prepare); +//发送数据 int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; - + //如果没有绑定本地端口,则绑定 if (unlikely(inet_send_prepare(sk))) return -EAGAIN; @@ -1984,7 +1992,7 @@ static int __init inet_init(void) * Tell SOCKET that we are alive... 
*/ - //注册了一个套接字操作集 + //注册了一个套接字操作集 用户申请套接字时实际调用 (void)sock_register(&inet_family_ops); #ifdef CONFIG_SYSCTL diff --git a/linux-6.2/net/ipv4/arp.c b/linux-6.2/net/ipv4/arp.c index 4f7237661..4b51e0856 100644 --- a/linux-6.2/net/ipv4/arp.c +++ b/linux-6.2/net/ipv4/arp.c @@ -220,6 +220,7 @@ static bool arp_key_eq(const struct neighbour *neigh, const void *pkey) return neigh_key_eq32(neigh, pkey); } +//初始化arp表项 static int arp_constructor(struct neighbour *neigh) { __be32 addr; diff --git a/linux-6.2/net/ipv4/inet_connection_sock.c b/linux-6.2/net/ipv4/inet_connection_sock.c index f2c43f671..8323544a9 100644 --- a/linux-6.2/net/ipv4/inet_connection_sock.c +++ b/linux-6.2/net/ipv4/inet_connection_sock.c @@ -298,6 +298,7 @@ static bool inet_bhash2_addr_any_conflict(const struct sock *sk, int port, int l * Find an open port number for the socket. Returns with the * inet_bind_hashbucket locks held if successful. */ +//寻找一个未被占用的端口 static struct inet_bind_hashbucket * inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret, struct inet_bind2_bucket **tb2_ret, @@ -316,7 +317,9 @@ inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret, ports_exhausted: attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; other_half_scan: + //获取当前进程允许使用的本地端口范围 inet_get_local_port_range(net, &low, &high); + //将其变为开区间 high++; /* [32768, 60999] -> [32768, 61000[ */ if (high - low < 4) attempt_half = 0; @@ -324,42 +327,56 @@ inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret, int half = low + (((high - low) >> 2) << 1); if (attempt_half == 1) + //取前一半 high = half; else low = half; } + //剩下的大小 remaining = high - low; if (likely(remaining > 1)) remaining &= ~1U; + //随机生成一个offset offset = get_random_u32_below(remaining); /* __inet_hash_connect() favors ports having @low parity * We do the opposite to not pollute connect() users. 
*/ + // offset 调整为一个奇数(最低位置 1),偶数端口留给 connect() 使用。 offset |= 1U; other_parity_scan: + //计算出端口 port = low + offset; + //逐步递增端口来检查一系列的端口 for (i = 0; i < remaining; i += 2, port += 2) { if (unlikely(port >= high)) port -= remaining; + //是否是系统保留的本地端口 if (inet_is_local_reserved_port(net, port)) continue; + //根据端口计算哈希值,并找到对应的哈希桶。 + //哈希桶用于存储绑定到该端口的 inet_bind_bucket head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; spin_lock_bh(&head->lock); + //是否使用了 bhash2 if (inet_use_bhash2_on_bind(sk)) { if (inet_bhash2_addr_any_conflict(sk, port, l3mdev, relax, false)) goto next_port; } - + //计算与当前端口关联的第二级哈希桶 head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); spin_lock(&head2->lock); + //查找与当前端口相关联的 inet_bind2_bucket tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); + //遍历与当前端口绑定的所有 inet_bind_bucket。通过 tb 指针访问每个桶 inet_bind_bucket_for_each(tb, &head->chain) if (inet_bind_bucket_match(tb, net, port, l3mdev)) { + //没有与已绑定的 inet_bind_bucket 冲突 if (!inet_csk_bind_conflict(sk, tb, tb2, relax, false)) + //成功找到 goto success; spin_unlock(&head2->lock); goto next_port; @@ -483,6 +500,7 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, * if snum is zero it means select any available local port.
* We try to allocate an odd port (and leave even ports for connect()) */ +//分配端口 int inet_csk_get_port(struct sock *sk, unsigned short snum) { struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); @@ -498,7 +516,9 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) l3mdev = inet_sk_bound_l3mdev(sk); + //未指定端口 if (!port) { + //查找可用端口 head = inet_csk_find_open_port(sk, &tb, &tb2, &head2, &port); if (!head) return ret; @@ -508,6 +528,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) if (tb && tb2) goto success; found_port = true; + //指定端口 } else { head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; diff --git a/linux-6.2/net/ipv4/ip_output.c b/linux-6.2/net/ipv4/ip_output.c index 922c87ef1..7e35d6a55 100644 --- a/linux-6.2/net/ipv4/ip_output.c +++ b/linux-6.2/net/ipv4/ip_output.c @@ -96,27 +96,33 @@ void ip_send_check(struct iphdr *iph) } EXPORT_SYMBOL(ip_send_check); +// int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); - + //更新长度 iph->tot_len = htons(skb->len); + //填充效验和 ip_send_check(iph); /* if egress device is enslaved to an L3 master device pass the * skb to its handler for processing */ + //处理l3主设备 VRF(虚拟路由) skb = l3mdev_ip_out(sk, skb); if (unlikely(!skb)) return 0; + //设置 skb->protocol skb->protocol = htons(ETH_P_IP); + //NF_INET_LOCAL_OUT 挂载点 回调dst_output return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); } +//处理ip数据包 int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; @@ -191,6 +197,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, } EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); +//ip层内部函数 static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); @@ -200,17 +207,19 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s struct neighbour *neigh; bool is_v6gw = 
false; + //多播/广播 if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len); + //扩充二层头部所需空间 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); if (!skb) return -ENOMEM; } - + //LWT 隧道封装 if (lwtunnel_xmit_redirect(dst->lwtstate)) { int res = lwtunnel_xmit(skb); @@ -219,12 +228,14 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s } rcu_read_lock_bh(); + //查找下一跳 neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); if (!IS_ERR(neigh)) { int res; sock_confirm_neigh(skb, neigh); /* if crossing protocols, can not use the cached header */ + //邻居子系统进入 res = neigh_output(neigh, skb, is_v6gw); rcu_read_unlock_bh(); return res; @@ -246,7 +257,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, /* common case: seglen is <= mtu */ - if (skb_gso_validate_network_len(skb, mtu)) + if (skb_gso_validate_network_len(skb, mtu)) return ip_finish_output2(net, sk, skb); /* Slowpath - GSO segment length exceeds the egress MTU.
@@ -288,7 +299,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned int mtu; - + //据包经过 SNAT 变更,可能需要 重新选路 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { @@ -296,23 +307,28 @@ static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff * return dst_output(net, sk, skb); } #endif + //获取 MTU mtu = ip_skb_dst_mtu(sk, skb); + //判断 GSO if (skb_is_gso(skb)) return ip_finish_output_gso(net, sk, skb, mtu); - + //需要 IP 分片 if (skb->len > mtu || IPCB(skb)->frag_max_size) return ip_fragment(net, sk, skb, mtu, ip_finish_output2); - + //进一步处理 return ip_finish_output2(net, sk, skb); } +//ip层内部函数 static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { int ret; + //BPF_PROG_TYPE_CGROUP_SKB 执行 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); switch (ret) { case NET_XMIT_SUCCESS: + //继续发送 return __ip_finish_output(net, sk, skb); case NET_XMIT_CN: return __ip_finish_output(net, sk, skb) ? 
: ret; @@ -418,15 +434,19 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) !(IPCB(skb)->flags & IPSKB_REROUTED)); } +//ip层内部函数 int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; + //更新 IP 统计数据 IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); - + + //设置dev 与 protocol skb->dev = dev; skb->protocol = htons(ETH_P_IP); + //NF_INET_POST_ROUTING 回调ip_finish_output return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, indev, dev, ip_finish_output, @@ -450,6 +470,7 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) } /* Note: skb->sk can be different from sk, in case of tunnels */ +//网络层发包 int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, __u8 tos) { @@ -467,12 +488,14 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); fl4 = &fl->u.ip4; + //获取路由缓存 rt = skb_rtable(skb); if (rt) goto packet_routed; /* Make sure we can route this packet. */ rt = (struct rtable *)__sk_dst_check(sk, 0); + //路由查找 if (!rt) { __be32 daddr; @@ -485,6 +508,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, * keep trying until route appears or the connection times * itself out. */ + //查找路由 rt = ip_route_output_ports(net, fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, @@ -494,17 +518,22 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, sk->sk_bound_dev_if); if (IS_ERR(rt)) goto no_route; + //设置sk中的结构 sk_setup_caps(sk, &rt->dst); } + //将路由信息绑定到 skb skb_dst_set_noref(skb, &rt->dst); packet_routed: + //设置了严格源路由选项但该路由需经过网关时,视为无可用路由 if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) goto no_route; /* OK, we know where to send it, allocate and build IP header. */ + //添加ip头部 skb_push(skb, sizeof(struct iphdr) + (inet_opt ?
inet_opt->opt.optlen : 0)); skb_reset_network_header(skb); + //构造ip头 iph = ip_hdr(skb); *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff)); if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df) @@ -517,18 +546,19 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, /* Transport layer set skb->h.foo itself. */ + //构造ip选项 if (inet_opt && inet_opt->opt.optlen) { iph->ihl += inet_opt->opt.optlen >> 2; ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt); } - + //设置id ip_select_ident_segs(net, skb, sk, skb_shinfo(skb)->gso_segs ?: 1); /* TODO : should we use skb->sk here instead of sk ? */ skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - + //下一步执行 res = ip_local_out(net, sk, skb); rcu_read_unlock(); return res; @@ -541,6 +571,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, } EXPORT_SYMBOL(__ip_queue_xmit); +//网络层发包入口函数 int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) { return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos); diff --git a/linux-6.2/net/ipv4/route.c b/linux-6.2/net/ipv4/route.c index de6e3515a..7b46571b7 100644 --- a/linux-6.2/net/ipv4/route.c +++ b/linux-6.2/net/ipv4/route.c @@ -2647,6 +2647,7 @@ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, } EXPORT_SYMBOL_GPL(ip_route_output_key_hash); +//查找路由 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, struct fib_result *res, const struct sk_buff *skb) @@ -2750,6 +2751,7 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, goto make_route; } + //核心函数 err = fib_lookup(net, fl4, res, 0); if (err) { res->fi = NULL; diff --git a/linux-6.2/net/ipv4/tcp.c b/linux-6.2/net/ipv4/tcp.c index 9c298a167..4a6bd1909 100644 --- a/linux-6.2/net/ipv4/tcp.c +++ b/linux-6.2/net/ipv4/tcp.c @@ -696,13 +696,14 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, refcount_read(&sk->sk_wmem_alloc) > skb->truesize 
&& tcp_skb_can_collapse_to(skb); } - -void tcp_push(struct sock *sk, int flags, int mss_now, +//发送数据 +void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, int size_goal) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; + //获取当前待发送的 skb skb = tcp_write_queue_tail(sk); if (!skb) return; @@ -728,6 +729,7 @@ void tcp_push(struct sock *sk, int flags, int mss_now, if (flags & MSG_MORE) nonagle = TCP_NAGLE_CORK; + //推送待发送的数据 __tcp_push_pending_frames(sk, mss_now, nonagle); } @@ -1165,6 +1167,7 @@ void tcp_free_fastopen_req(struct tcp_sock *tp) } } +//TFO机制,在SYN中直接携带数据来加快TCP连接 int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, size_t size, struct ubuf_info *uarg) { @@ -1180,16 +1183,20 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, return -EOPNOTSUPP; if (tp->fastopen_req) return -EALREADY; /* Another Fast Open is in progress */ - + + //分配TFO结构 tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request), sk->sk_allocation); if (unlikely(!tp->fastopen_req)) return -ENOBUFS; + //记录要发送报告的数据和零拷贝信息 tp->fastopen_req->data = msg; tp->fastopen_req->size = size; tp->fastopen_req->uarg = uarg; + //处理延迟连接 if (inet->defer_connect) { + //建立连接 err = tcp_connect(sk); /* Same failure procedure as in tcp_v4/6_connect */ if (err) { @@ -1199,11 +1206,13 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, } } flags = (msg->msg_flags & MSG_DONTWAIT) ? 
O_NONBLOCK : 0; + //直接尝试连接,并在 SYN 中附加数据 err = __inet_stream_connect(sk->sk_socket, uaddr, msg->msg_namelen, flags, 1); /* fastopen_req could already be freed in __inet_stream_connect * if the connection times out or gets rst */ + //释放结构 if (tp->fastopen_req) { *copied = tp->fastopen_req->copied; tcp_free_fastopen_req(tp); @@ -1211,9 +1220,10 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, } return err; } - +//tcp发送数据内部函数 分配新skb挂在发送队列中 int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) { + //sock和tcp_sock可以相互转化 tcp_sock中有独有的信息 struct tcp_sock *tp = tcp_sk(sk); struct ubuf_info *uarg = NULL; struct sk_buff *skb; @@ -1224,27 +1234,40 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) bool zc = false; long timeo; + //读取flags flags = msg->msg_flags; + //零拷贝逻辑 数据直接从用户态传输到 socket 发送队列,避免用户态和内核态拷贝 + //这里进行检查 将标志位zc置位 if ((flags & MSG_ZEROCOPY) && size) { + //获取 TCP 发送队列的最后一个 skb skb = tcp_write_queue_tail(sk); + //是否已有用户缓冲区 if (msg->msg_ubuf) { uarg = msg->msg_ubuf; net_zcopy_get(uarg); + //支持聚合机制 zc = sk->sk_route_caps & NETIF_F_SG; + //SOCK_ZEROCOPY置位 } else if (sock_flag(sk, SOCK_ZEROCOPY)) { + //为用户数据分配新的 uarg uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb)); if (!uarg) { err = -ENOBUFS; goto out_err; } + //支持聚合机制 zc = sk->sk_route_caps & NETIF_F_SG; if (!zc) + //不支持 清除标志位 uarg_to_msgzc(uarg)->zerocopy = 0; } } - + // 检查TCP Fast Open (TFO) 和延迟连接 (defer_connect) + //TFO是减少一次往返来加快TCP连接(在SYN中直接携带数据) + //延迟连接 避免connect时触发握手,等实际发送时触发 + //repair if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) && !tp->repair) { err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg); @@ -1256,21 +1279,26 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + //监测应用程序是否限制了发送速率 tcp_rate_check_app_limited(sk); /* is sending application-limited? */ /* Wait for a connection to finish. 
One exception is TCP Fast Open * (passive side) where data is allowed to be sent before a connection * is fully established. */ + //当前套接字状态不为已建立或者关闭等待,且没有启用 TFO,被动端的连接需要等待建立完成 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && !tcp_passive_fastopen(sk)) { + //等待建立 err = sk_stream_wait_connect(sk, &timeo); if (err != 0) goto do_error; } + //需要修复 if (unlikely(tp->repair)) { if (tp->repair_queue == TCP_RECV_QUEUE) { + //将数据直接插入到接收队列 copied = tcp_send_rcvq(sk, msg, size); goto out_nopush; } @@ -1282,8 +1310,11 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) /* 'common' sending to sendq */ } + //初始化sockcm_init结构 sockcm_init(&sockc, sk); + //检查控制数据部分的长度 if (msg->msg_controllen) { + //发送控制消息 err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) { err = -EINVAL; @@ -1292,25 +1323,29 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) } /* This should be in poll */ + //相应判断 进行清除状态 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); /* Ok commence sending. */ copied = 0; restart: + // 获取当前 MSS mss_now = tcp_send_mss(sk, &size_goal, flags); err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; - + //循环发送数据 while (msg_data_left(msg)) { int copy = 0; - + //获取发送队列的尾部数据包 skb = tcp_write_queue_tail(sk); if (skb) + //剩余容量 copy = size_goal - skb->len; + //容量不足 /无法合并 if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { bool first_skb; @@ -1323,14 +1358,17 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) if (sk_flush_backlog(sk)) goto restart; } + //检查重传队列和写队列是否为空 first_skb = tcp_rtx_and_write_queues_empty(sk); + //分配一个新的 skb skb = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation, first_skb); if (!skb) goto wait_for_space; process_backlog++; - + + //将 skb 添加到 TCP 发送队列中 tcp_skb_entail(sk, skb); copy = size_goal; @@ -1345,9 +1383,11 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) /* Try to append data to the end of skb. 
*/ if (copy > msg_data_left(msg)) copy = msg_data_left(msg); - + + //不能零拷贝 if (!zc) { bool merge = true; + //获取 skb 中当前的片段数量 int i = skb_shinfo(skb)->nr_frags; struct page_frag *pfrag = sk_page_frag(sk); @@ -1375,6 +1415,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) if (!copy) goto wait_for_space; + //将数据从消息缓冲区复制到页面片段中 err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, pfrag->page, pfrag->offset, @@ -1383,6 +1424,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) goto do_error; /* Update the skb. */ + //合并成功 if (merge) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); } else { @@ -1404,6 +1446,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) goto wait_for_space; } + //零拷贝路径:不复制数据,而是把用户态页面直接挂到 skb 的 frags 上 err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); if (err == -EMSGSIZE || err == -EEXIST) { tcp_mark_push(tp, skb); @@ -1417,11 +1460,13 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) if (!copied) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; + //更新 write_seq WRITE_ONCE(tp->write_seq, tp->write_seq + copy); TCP_SKB_CB(skb)->end_seq += copy; tcp_skb_pcount_set(skb, 0); copied += copy; + //检查消息中是否还有未处理的数据 if (!msg_data_left(msg)) { if (unlikely(flags & MSG_EOR)) TCP_SKB_CB(skb)->eor = 1; @@ -1431,6 +1476,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair)) continue; + //强制推送数据 + //未发送的数据超过最大窗口的一半 if (forced_push(tp)) { tcp_mark_push(tp, skb); __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); @@ -1441,6 +1488,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) wait_for_space: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); if (copied) + //推送已经拷贝的数据 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); @@ -1477,6 +1525,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) }
EXPORT_SYMBOL_GPL(tcp_sendmsg_locked); +//tcp发送数据入口函数 int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { int ret; diff --git a/linux-6.2/net/ipv4/tcp_input.c b/linux-6.2/net/ipv4/tcp_input.c index 284469e80..20e5e0998 100644 --- a/linux-6.2/net/ipv4/tcp_input.c +++ b/linux-6.2/net/ipv4/tcp_input.c @@ -4984,6 +4984,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) goto err_free; } + //将用户空间的数据复制到内核空间的缓冲区 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); if (err) goto err_free; @@ -4992,6 +4993,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; + //调用 tcp_queue_rcv 将数据包(skb)添加到接收队列 if (tcp_queue_rcv(sk, skb, &fragstolen)) { WARN_ON_ONCE(fragstolen); /* should not happen */ __kfree_skb(skb); diff --git a/linux-6.2/net/ipv4/tcp_output.c b/linux-6.2/net/ipv4/tcp_output.c index 71d01cf3c..d9be361ee 100644 --- a/linux-6.2/net/ipv4/tcp_output.c +++ b/linux-6.2/net/ipv4/tcp_output.c @@ -60,7 +60,6 @@ void tcp_mstamp_refresh(struct tcp_sock *tp) static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); - /* Account for new data that has been sent to the network. */ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { @@ -1231,6 +1230,10 @@ INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. 
*/ +/*复制或者拷贝skb,构造skb中的tcp首部,并将调用网络层的发送函数发送skb; +在发送前,首先需要克隆或者复制skb,因为在成功发送到网络设备之后, +skb会释放,而tcp层不能真正的释放,是需要等到对该数据段的ack才可以释放; +然后构造tcp首部和选项;最后调用网络层提供的发送回调函数发送skb*/ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask, u32 rcv_nxt) { @@ -1251,13 +1254,16 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, prior_wstamp = tp->tcp_wstamp_ns; tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); + //需要克隆 if (clone_it) { oskb = skb; tcp_skb_tsorted_save(oskb) { if (unlikely(skb_cloned(oskb))) + //克隆过进行复制 skb = pskb_copy(oskb, gfp_mask); else + //进行克隆 skb = skb_clone(oskb, gfp_mask); } tcp_skb_tsorted_restore(oskb); @@ -1272,10 +1278,11 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, inet = inet_sk(sk); tcb = TCP_SKB_CB(skb); memset(&opts, 0, sizeof(opts)); - + //计算SYN包tcp选项的长度 if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) { tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); } else { + //计算已连接状态的tcp选项长度 tcp_options_size = tcp_established_options(sk, skb, &opts, &md5); /* Force a PSH flag on all (GSO) packets to expedite GRO flush @@ -1289,6 +1296,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, if (tcp_skb_pcount(skb) > 1) tcb->tcp_flags |= TCPHDR_PSH; } + //tcp头部长度 tcp_header_size = tcp_options_size + sizeof(struct tcphdr); /* if no packet is in qdisc/device queue, then allow XPS to select @@ -1307,7 +1315,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, */ skb->pfmemalloc = 0; + //在skb前加入tcp头 skb_push(skb, tcp_header_size); + //设置skb中传输层头部偏移 skb_reset_transport_header(skb); skb_orphan(skb); @@ -1318,6 +1328,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm); /* Build TCP header and checksum it. 
*/ + //构造tcp头部 th = (struct tcphdr *)skb->data; th->source = inet->inet_sport; th->dest = inet->inet_dport; @@ -1350,7 +1361,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, */ th->window = htons(min(tp->rcv_wnd, 65535U)); } - + //写入tcp选项 tcp_options_write(th, tp, &opts); #ifdef CONFIG_TCP_MD5SIG @@ -1363,8 +1374,10 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, #endif /* BPF prog is the last one writing header option */ + //BPF 程序钩子,用于修改 TCP 头部选项 bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts); + //相应的检查函数 INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check, tcp_v6_send_check, tcp_v4_send_check, sk, skb); @@ -1394,8 +1407,10 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), sizeof(struct inet6_skb_parm))); + //记录发包时间 到skb->skb_mstamp_ns tcp_add_tx_delay(skb, tp); + //相应的下一层函数 err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit, inet6_csk_xmit, ip_queue_xmit, sk, skb, &inet->cork.fl); @@ -2598,6 +2613,7 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) * Returns true, if no segments are in flight and we have queued segments, * but cannot send anything now because of SWS or another problem. */ +//tcp 发送 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp) { @@ -2610,10 +2626,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, u32 max_segs; sent_pkts = 0; - + //刷新 tcp_sock 的时间戳信息 tcp_mstamp_refresh(tp); if (!push_one) { /* Do MTU probing. 
*/ + //探测网络路径的最大传输单元MTU result = tcp_mtu_probe(sk); if (!result) { return false; @@ -2621,11 +2638,12 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, sent_pkts = 1; } } - + //最大分段数 max_segs = tcp_tso_segs(sk, mss_now); + //循环获取待发送skb 获取队列中第一个数据包 while ((skb = tcp_send_head(sk))) { unsigned int limit; - + //修复模式 if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp_ns" is used as a start point for the retransmit timer */ tp->tcp_wstamp_ns = tp->tcp_clock_cache; @@ -2634,13 +2652,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, tcp_init_tso_segs(skb, mss_now); goto repair; /* Skip network transmission */ } - + //TCP 流量控制 if (tcp_pacing_check(sk)) break; - + //分段数 tso_segs = tcp_init_tso_segs(skb, mss_now); BUG_ON(!tso_segs); - + //滑动窗口相关 测试是否有足够的拥塞窗口空间 cwnd_quota = tcp_cwnd_test(tp, skb); if (!cwnd_quota) { if (push_one == 2) @@ -2649,18 +2667,20 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, else break; } - + //是否存在接收窗口限制 if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) { is_rwnd_limited = true; break; } if (tso_segs == 1) { + //Nagle 算法 if (unlikely(!tcp_nagle_test(tp, skb, mss_now, (tcp_skb_is_last(sk, skb) ? nonagle : TCP_NAGLE_PUSH)))) break; } else { + //是否应该延迟分段的发 if (!push_one && tcp_tso_should_defer(sk, skb, &is_cwnd_limited, &is_rwnd_limited, max_segs)) @@ -2676,6 +2696,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, nonagle); if (skb->len > limit && + // 对数据包进行分段处理 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; @@ -2689,7 +2710,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, */ if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) break; - + //发送skb if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; @@ -2864,6 +2885,7 @@ void tcp_send_loss_probe(struct sock *sk) * TCP_CORK or attempt at coalescing tiny packets. 
* The socket must be locked by the caller. */ +//检查套接字状态 发送数据 void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle) { @@ -2873,7 +2895,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, */ if (unlikely(sk->sk_state == TCP_CLOSE)) return; - + //发送数据 if (tcp_write_xmit(sk, cur_mss, nonagle, 0, sk_gfp_mask(sk, GFP_ATOMIC))) tcp_check_probe_timer(sk); diff --git a/linux-6.2/net/ipv4/tcp_rate.c b/linux-6.2/net/ipv4/tcp_rate.c index a8f6d9d06..09bad3a54 100644 --- a/linux-6.2/net/ipv4/tcp_rate.c +++ b/linux-6.2/net/ipv4/tcp_rate.c @@ -191,6 +191,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, } /* If a gap is detected between sends, mark the socket application-limited. */ +//监测应用程序是否限制了发送速率 void tcp_rate_check_app_limited(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -200,6 +201,7 @@ void tcp_rate_check_app_limited(struct sock *sk) /* Nothing in sending host's qdisc queues or NIC tx queue. */ sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) && /* We are not limited by CWND. */ + //检查当前的发送窗口是否限制了发送速率 tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) && /* All lost packets have been retransmitted. 
*/ tp->lost_out <= tp->retrans_out) diff --git a/linux-6.2/net/sched/sch_generic.c b/linux-6.2/net/sched/sch_generic.c index a9aadc4e6..f7503907e 100644 --- a/linux-6.2/net/sched/sch_generic.c +++ b/linux-6.2/net/sched/sch_generic.c @@ -311,6 +311,7 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, * false - hardware queue frozen backoff * true - feel free to send more pkts */ +//网络设备子系统函数发送数据包 bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, spinlock_t *root_lock, bool validate) @@ -326,7 +327,7 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, if (validate) skb = validate_xmit_skb_list(skb, dev, &again); -#ifdef CONFIG_XFRM_OFFLOAD +#ifdef CONFIG_XFRM_OFFLOAD if (unlikely(again)) { if (root_lock) spin_lock(root_lock); @@ -339,6 +340,7 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, if (likely(skb)) { HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_stopped(txq)) + //发送 skb = dev_hard_start_xmit(skb, dev, txq, &ret); else qdisc_maybe_clear_missed(q, txq); @@ -385,6 +387,7 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, * >0 - queue is not empty. 
* */ +//发送数据 static inline bool qdisc_restart(struct Qdisc *q, int *packets) { spinlock_t *root_lock = NULL; @@ -394,6 +397,7 @@ static inline bool qdisc_restart(struct Qdisc *q, int *packets) bool validate; /* Dequeue packet */ + //从队列中取包 skb = dequeue_skb(q, &validate, packets); if (unlikely(!skb)) return false; @@ -401,12 +405,15 @@ static inline bool qdisc_restart(struct Qdisc *q, int *packets) if (!(q->flags & TCQ_F_NOLOCK)) root_lock = qdisc_lock(q); + //获取发送队列: dev = qdisc_dev(q); txq = skb_get_tx_queue(dev, skb); + //发送数据包: return sch_direct_xmit(skb, q, dev, txq, root_lock, validate); } +//调度队列中的数据包 void __qdisc_run(struct Qdisc *q) { int quota = READ_ONCE(dev_tx_weight); @@ -414,6 +421,7 @@ void __qdisc_run(struct Qdisc *q) while (qdisc_restart(q, &packets)) { quota -= packets; + //配额用完 触发软中断 if (quota <= 0) { if (q->flags & TCQ_F_NOLOCK) set_bit(__QDISC_STATE_MISSED, &q->state); diff --git a/linux-6.2/net/socket.c b/linux-6.2/net/socket.c index 7ff123b11..bc8632558 100644 --- a/linux-6.2/net/socket.c +++ b/linux-6.2/net/socket.c @@ -239,13 +239,14 @@ static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly; * too long an error code of -EINVAL is returned. If the copy gives * invalid addresses -EFAULT is returned. On a success 0 is returned. 
*/ - +//Read the address from user space. int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr) { if (ulen < 0 || ulen > sizeof(struct sockaddr_storage)) return -EINVAL; if (ulen == 0) return 0; + //The actual copy from user space. if (copy_from_user(kaddr, uaddr, ulen)) return -EFAULT; return audit_sockaddr(ulen, kaddr); @@ -545,13 +546,17 @@ EXPORT_SYMBOL(sockfd_lookup); static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed) { + //Get the struct fd for this descriptor. struct fd f = fdget(fd); struct socket *sock; *err = -EBADF; + //The file exists. if (f.file) { + //Get the socket structure. sock = sock_from_file(f.file); if (likely(sock)) { + //Record whether f.flags has the FDPUT_FPUT bit set. *fput_needed = f.flags & FDPUT_FPUT; return sock; } @@ -713,8 +718,10 @@ INDIRECT_CALLABLE_DECLARE(int inet_sendmsg(struct socket *, struct msghdr *, size_t)); INDIRECT_CALLABLE_DECLARE(int inet6_sendmsg(struct socket *, struct msghdr *, size_t)); +//Send the message. static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) { + //Call sock->ops->sendmsg; per the author's note it is set to inet_sendmsg when the socket is created (presumably for AF_INET - confirm). int ret = INDIRECT_CALL_INET(sock->ops->sendmsg, inet6_sendmsg, inet_sendmsg, sock, msg, msg_data_left(msg)); @@ -730,11 +737,13 @@ static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) * Sends @msg through @sock, passing through LSM. * Returns the number of bytes sent, or an error code. */ +//Send msg through the sock structure. int sock_sendmsg(struct socket *sock, struct msghdr *msg) { + //Security check: per the author's note, when CONFIG_SECURITY_NETWORK is enabled this walks the hook list and runs each hook. int err = security_socket_sendmsg(sock, msg, msg_data_left(msg)); - + //If err is 0, call sock_sendmsg_nosec. return err ?: sock_sendmsg_nosec(sock, msg); } EXPORT_SYMBOL(sock_sendmsg); @@ -2117,38 +2126,47 @@ SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr, * space and check the user space data area is readable before invoking * the protocol.
*/ +//Initialize msg and look up the socket structure. int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, struct sockaddr __user *addr, int addr_len) { struct socket *sock; struct sockaddr_storage address; int err; + //Message header (msghdr). struct msghdr msg; struct iovec iov; int fput_needed; - + //Initialize iov and msg.msg_iter. err = import_single_range(ITER_SOURCE, buff, len, &iov, &msg.msg_iter); if (unlikely(err)) return err; - sock = sockfd_lookup_light(fd, &err, &fput_needed); + //Find the socket structure for fd and record whether the FDPUT_FPUT bit is set. + //The socket records the protocol stack's function addresses (author's note). + sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; + //Initialize the corresponding msg fields. msg.msg_name = NULL; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_namelen = 0; msg.msg_ubuf = NULL; + //Store the peer address information. if (addr) { + //Copy the address into the kernel. err = move_addr_to_kernel(addr, addr_len, &address); if (err < 0) goto out_put; msg.msg_name = (struct sockaddr *)&address; msg.msg_namelen = addr_len; } + //Non-blocking mode. if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; msg.msg_flags = flags; + //Call sock_sendmsg to send msg. err = sock_sendmsg(sock, &msg); out_put: diff --git a/linux-6.2/security/security.c b/linux-6.2/security/security.c index d1571900a..dd2a51525 100644 --- a/linux-6.2/security/security.c +++ b/linux-6.2/security/security.c @@ -762,6 +762,7 @@ static int lsm_superblock_alloc(struct super_block *sb) P->hook.FUNC(__VA_ARGS__); \ } while (0) +//Walk every security hook registered on the security_hook_heads.FUNC list and call its function P->hook.FUNC. #define call_int_hook(FUNC, IRC, ...) ({ \ int RC = IRC; \ do { \