欢迎光临
我们一直在努力

dpvs学习笔记: 12 TOA 实现原理

在 full-nat two-arm 模式下,后端 real server 获取到请求的来源都是 dpvs local ip, 如何获取真实的 client ip 呢?这就需要 toa 模块,原理都说是修改了 rs 机器获取 ip 的函数,具体如何实现呢?

tcp option 字段

关于 tcp header 可以参考 wiki, 我把截图贴上来

tcp header

我们知道 ip header 里 src address 肯定是 dpvs local ip, 否则数据包无法发送。那么 client ip 放哪里呢?就是在 tcp header 的 option 字段中。 

option 字段最长 40 bytes. 每填充一个选项由三部分构成:op-kind, op-length, op-data. 最常用的 mss 字段就是放在 option 里。只要构建一个不冲突的 op-kind 就可以把 client ip 填充进去。ipv4 的长度是 4 bytes, ipv6 是 16 bytes. 看来整个 option 字段在不久就会不够用。

dpvs 写 tcp option address

DPVS fullnat 在调用 tcp_fnat_in_handler 时会调用 tcp_in_add_toa 写到 mbuf.

static inline int tcp_in_add_toa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf,
                          struct tcphdr *tcph)
{
    uint32_t mtu;
    struct tcpopt_addr *toa;
    uint32_t tcp_opt_len;

    uint8_t *p, *q, *tail;
    struct route_entry *rt;

    if (unlikely(conn->af != AF_INET && conn->af != AF_INET6))
        return EDPVS_NOTSUPP;

    tcp_opt_len = conn->af == AF_INET ? TCP_OLEN_IP4_ADDR : TCP_OLEN_IP6_ADDR;
    /*
     * check if we can add the new option
     */
    /* skb length and tcp option length checking */
    if ((rt = mbuf->userdata) != NULL) {
        mtu = rt->mtu;
    } else if (conn->in_dev) { /* no route for fast-xmit */
        mtu = conn->in_dev->mtu;
    } else {
        RTE_LOG(DEBUG, IPVS, "add toa: MTU unknown.\n");
        return EDPVS_NOROUTE;
    }

    if (unlikely(mbuf->pkt_len > (mtu - tcp_opt_len))) {
        RTE_LOG(DEBUG, IPVS, "add toa: need fragment, tcp opt len : %u.\n",
                tcp_opt_len);
        return EDPVS_FRAG;
    }

    /* maximum TCP header is 60, and 40 for options */
    if (unlikely((60 - (tcph->doff << 2)) < tcp_opt_len)) {
        RTE_LOG(DEBUG, IPVS, "add toa: no TCP header room, tcp opt len : %u.\n",
                tcp_opt_len);
        return EDPVS_NOROOM;
    }

    /* check tail room and expand mbuf.
     * have to pull all bits in segments for later operation. */
    if (unlikely(mbuf_may_pull(mbuf, mbuf->pkt_len) != 0))
        return EDPVS_INVPKT;
    tail = (uint8_t *)rte_pktmbuf_append(mbuf, tcp_opt_len);
    if (unlikely(!tail)) {
        RTE_LOG(DEBUG, IPVS, "add toa: no mbuf tail room, tcp opt len : %u.\n",
                tcp_opt_len);
        return EDPVS_NOROOM;
    }

    /*
     * now add address option
     */

    /* move data down, including existing tcp options
     * @p is last data byte,
     * @q is new position of last data byte */
    p = tail - 1;
    q = p + tcp_opt_len;
    while (p >= ((uint8_t *)tcph + sizeof(struct tcphdr))) {
        *q = *p;
        p--, q--;
    }

    /* insert toa right after TCP basic header */
    toa = (struct tcpopt_addr *)(tcph + 1);
    toa->opcode = TCP_OPT_ADDR;
    toa->opsize = tcp_opt_len;
    toa->port = conn->cport;

    if (conn->af == AF_INET) {
        struct tcpopt_ip4_addr *toa_ip4 = (struct tcpopt_ip4_addr *)(tcph + 1);
        toa_ip4->addr = conn->caddr.in;
    }
    else {
        struct tcpopt_ip6_addr *toa_ip6 = (struct tcpopt_ip6_addr *)(tcph + 1);
        toa_ip6->addr = conn->caddr.in6;
    }


    /* reset tcp header length */
    tcph->doff += tcp_opt_len >> 2;

    /* reset ip header total length */
    if (conn->af == AF_INET)
        ip4_hdr(mbuf)->total_length =
            htons(ntohs(ip4_hdr(mbuf)->total_length) + tcp_opt_len);
    else
        ip6_hdr(mbuf)->ip6_plen =
            htons(ntohs(ip6_hdr(mbuf)->ip6_plen) + tcp_opt_len);

    /* tcp csum will be recalc later, 
     * so as IP hdr csum since iph.tot_len has been chagned. */
    return EDPVS_OK;
}
  1. 根据 ipv4 ipv6 来确定 toa 需要的长度:1 byte op-kind, 1 byte op-length, 2 bytes port, 再加上地址长度。所以 ipv4 共需 8 bytes, ipv6 共需 20 bytes
  2. TCP header 最大长度 60,option 最大长度 40,确保不会超过
  3. rte_pktmbuf_append 将 mbuf 扩展空间,能容纳 toa
  4. 填充 tcpopt_addr 结构体,op-kind TCP_OPT_ADDR 是 254,非官方 tcp/ip 认可的值。端口值是 conn->cport, 最后填充 conn->caddr.in 或 conn->caddr.in6 地址。

real server 安装 toa

很简单,make 编译后生成 toa.ko 驱动,然后 insmod toa.ko 即可。所有 real server 都需要安装。先看下 module_init 函数 toa_init

/*
 * Module init: allocate the per-CPU statistics, create the "toa_stats"
 * proc entry, look up the kernel symbols TOA relies on and install the
 * hooks.
 *
 * Returns 0 on success and a negative errno on failure. A module init
 * function must follow the 0 / -E convention: the kernel only treats a
 * negative value as a failed load, so a positive return would leave the
 * module loaded even though its resources were already released.
 */
static int __init
toa_init(void)
{
    int ret;

    TOA_INFO("TOA " TOA_VERSION " by pukong.wjm\n");

    /* alloc statistics array for toa */
    ext_stats = alloc_percpu(struct toa_stat_mib);
    if (NULL == ext_stats)
        return -ENOMEM;
    proc_net_fops_create(&init_net, "toa_stats", 0, &toa_stats_fops);

    /* get the address of function sock_def_readable
     * so later we can know whether the sock is for rpc, tux or others
     */
    sk_data_ready_addr = kallsyms_lookup_name("sock_def_readable");
    TOA_INFO("CPU [%u] sk_data_ready_addr = "
        "kallsyms_lookup_name(sock_def_readable) = %lu\n",
         smp_processor_id(), sk_data_ready_addr);
    if (0 == sk_data_ready_addr) {
        TOA_INFO("cannot find sock_def_readable.\n");
        ret = -ENOENT;
        goto err;
    }

#ifdef TOA_IPV6_ENABLE
    if (0 != get_kernel_ipv6_symbol()) {
        TOA_INFO("get ipv6 struct from kernel fail.\n");
        /* the helper's own return value is not known to be an errno,
         * so report a fixed one */
        ret = -ENOENT;
        goto err;
    }
#endif

    /* hook funcs for parse and get toa */
    hook_toa_functions();

    TOA_INFO("toa loaded\n");
    return 0;

err:
    /* undo what was set up before the failure */
    proc_net_remove(&init_net, "toa_stats");
    if (NULL != ext_stats) {
        free_percpu(ext_stats);
        ext_stats = NULL;
    }

    return ret;
}
  1. proc_net_fops_create 在 /proc 文件系统下注册 /proc/net/toa_stats 用于查看统计使用
  2. kallsyms_lookup_name 根据名称来获取 sock_def_readable 地址
  3. get_kernel_ipv6_symbol 如果支持 ipv6, 获取相应的回调函数地址
  4. hook_toa_functions 将 toa 功能 hook 进内核
    proc_net_fops_create

/* replace the functions with our functions */
static inline int
hook_toa_functions(void)
{
    /* hook inet_getname for ipv4 */
    struct proto_ops *inet_stream_ops_p =
            (struct proto_ops *)&inet_stream_ops;
    /* hook tcp_v4_syn_recv_sock for ipv4 */
    struct inet_connection_sock_af_ops *ipv4_specific_p =
            (struct inet_connection_sock_af_ops *)&ipv4_specific;

    inet_stream_ops_p->getname = inet_getname_toa;
    TOA_INFO("CPU [%u] hooked inet_getname <%p> --> <%p>\n",
        smp_processor_id(), inet_getname, inet_stream_ops_p->getname);

    ipv4_specific_p->syn_recv_sock = tcp_v4_syn_recv_sock_toa;
    TOA_INFO("CPU [%u] hooked tcp_v4_syn_recv_sock <%p> --> <%p>\n",
        smp_processor_id(), tcp_v4_syn_recv_sock,
        ipv4_specific_p->syn_recv_sock);

#ifdef TOA_IPV6_ENABLE
    inet6_stream_ops_p->getname = inet6_getname_toa;
    TOA_INFO("CPU [%u] hooked inet6_getname <%p> --> <%p>\n",
        smp_processor_id(), inet6_getname, inet6_stream_ops_p->getname);

    ipv6_specific_p->syn_recv_sock = tcp_v6_syn_recv_sock_toa;
    TOA_INFO("CPU [%u] hooked tcp_v6_syn_recv_sock <%p> --> <%p>\n",
        smp_processor_id(), tcp_v6_syn_recv_sock_org_pt,
        ipv6_specific_p->syn_recv_sock);
#endif

    return 0;
}

仔细看看也不难,就是将 inet ops 回调函数 getname 替换为 toa 的。但是我有问题,如果请求不来自 dpvs,普通的请求会不会也受影响?

可以看到 hook 了两个函数 tcp_v4_syn_recv_sock_toa 和 inet_getname_toa

real server 获取 client ip

当完成三次握手时调用 tcp_v4_syn_recv_sock_toa

/*
 * Replacement for tcp_v4_syn_recv_sock: lets the original function create
 * the new socket, then stores whatever get_toa_data() extracts from @skb
 * in the new socket's sk_user_data, provided that field is still NULL.
 *
 * Returns the socket produced by tcp_v4_syn_recv_sock (possibly NULL).
 */
static struct sock *
tcp_v4_syn_recv_sock_toa(struct sock *sk, struct sk_buff *skb,
            struct request_sock *req, struct dst_entry *dst)
{
    struct sock *child;

    TOA_DBG("tcp_v4_syn_recv_sock_toa called\n");

    /* the original handler does the real work */
    child = tcp_v4_syn_recv_sock(sk, skb, req, dst);

    /* nothing to add without a socket, or when sk_user_data is taken */
    if (NULL == child || NULL != child->sk_user_data)
        return child;

    child->sk_user_data = get_toa_data(AF_INET, skb);
    if (NULL == child->sk_user_data) {
        TOA_INC_STATS(ext_stats, SYN_RECV_SOCK_NO_TOA_CNT);
    } else {
        TOA_INC_STATS(ext_stats, SYN_RECV_SOCK_TOA_CNT);
    }

    TOA_DBG("tcp_v4_syn_recv_sock_toa: set "
        "sk->sk_user_data to %p\n",
        child->sk_user_data);

    return child;
}
  1. 调用原有函数 tcp_v4_syn_recv_sock 处理,也就是这里兼容了原有逻辑,普通非 toa 请求也会正常获取到 ip
  2. 额外调用 get_toa_data 生成地址,可以看到地址放到了 sk->sk_user_data 字段。

/*
 * Walk the TCP options of @skb looking for a TOA option and return it in
 * the form that the callers store in sk->sk_user_data.
 *
 * @af:  address family of the socket (AF_INET or AF_INET6).
 * @skb: received segment; may be NULL.
 *
 * Return value:
 *   - AF_INET with an IPv4 TOA option: the option bytes themselves packed
 *     into the pointer value (nothing is allocated).
 *     NOTE(review): only sizeof(void *) bytes are kept, so this presumably
 *     relies on pointers being as wide as struct toa_ip4_data - confirm.
 *   - AF_INET6 (TOA_IPV6_ENABLE only): a kmalloc'ed struct toa_ip6_data,
 *     either copied from an IPv6 TOA option or built as an IPv4-mapped
 *     address from an IPv4 TOA option. Releasing it is up to the caller.
 *   - NULL when @skb is NULL, no usable TOA option is found, the options
 *     are malformed, or the allocation fails.
 */
static void *get_toa_data(int af, struct sk_buff *skb)
{
    struct tcphdr *th;
    int length;
    unsigned char *ptr;

    TOA_DBG("get_toa_data called\n");

    if (NULL == skb)
        return NULL;

    th = tcp_hdr(skb);
    /* number of option bytes following the basic header */
    length = (th->doff * 4) - sizeof(struct tcphdr);
    ptr = (unsigned char *) (th + 1);

    while (length > 0) {
        int opcode = *ptr++;
        int opsize;

        switch (opcode) {
        case TCPOPT_EOL:
            return NULL;
        case TCPOPT_NOP:    /* Ref: RFC 793 section 3.1 */
            length--;
            continue;
        default:
            /* a length byte must still be inside the option area;
             * without this check the read below would go one byte
             * past the options when only the kind byte is left */
            if (length < 2)
                return NULL;
            opsize = *ptr++;
            if (opsize < 2) /* "silly options" */
                return NULL;
            if (opsize > length)
                /* don't parse partial options */
                return NULL;
            if (TCPOPT_TOA == opcode &&
                TCPOLEN_IP4_TOA == opsize) {

                struct toa_ip4_data tdata;
                void *ret_ptr = NULL;

                /* ptr is past kind and length, so step back 2 bytes
                 * to copy the option from its first byte */
                memcpy(&tdata, ptr - 2, sizeof(tdata));
                TOA_DBG("af = %d, find toa data: ip = "
                    TOA_NIPQUAD_FMT", port = %u\n",
                    af,
                    TOA_NIPQUAD(tdata.ip),
                    ntohs(tdata.port));
                if (af == AF_INET) {
                    /* encode the option in the pointer value itself */
                    memcpy(&ret_ptr, &tdata,
                        sizeof(ret_ptr));
                    TOA_DBG("coded ip4 toa data: %p\n",
                        ret_ptr);
                    return ret_ptr;
                }
#ifdef TOA_IPV6_ENABLE
                else if (af == AF_INET6) {
                    struct toa_ip6_data *ptr_toa_ip6 =
                        kmalloc(sizeof(struct toa_ip6_data), GFP_ATOMIC);
                    if (!ptr_toa_ip6) {
                        return NULL;
                    }
                    ptr_toa_ip6->opcode = opcode;
                    ptr_toa_ip6->opsize = TCPOLEN_IP6_TOA;
                    /* kmalloc does not zero the memory: the port has
                     * to be filled in as well, otherwise it would hold
                     * stale heap contents */
                    ptr_toa_ip6->port = tdata.port;
                    /* IPv4-mapped IPv6 address ::ffff:a.b.c.d */
                    ipv6_addr_set(&ptr_toa_ip6->in6_addr, 0, 0,
                        htonl(0x0000FFFF), tdata.ip);
                    TOA_DBG("coded ip6 toa data: %p\n",
                        ptr_toa_ip6);
                    TOA_INC_STATS(ext_stats, IP6_ADDR_ALLOC_CNT);
                    return ptr_toa_ip6;
                }
#endif
            }

#ifdef TOA_IPV6_ENABLE
            if (TCPOPT_TOA == opcode &&
                TCPOLEN_IP6_TOA == opsize &&
                af == AF_INET6) {
                struct toa_ip6_data *ptr_toa_ip6 =
                    kmalloc(sizeof(struct toa_ip6_data), GFP_ATOMIC);
                if (!ptr_toa_ip6) {
                        return NULL;
                }
                memcpy(ptr_toa_ip6, ptr - 2, sizeof(struct toa_ip6_data));

                /* the port is copied straight from the option, so convert
                 * it for printing like the IPv4 branch does */
                TOA_DBG("find toa_v6 data : ip = "
                    TOA_NIP6_FMT", port = %u,"
                    " coded ip6 toa data: %p\n",
                    TOA_NIP6(ptr_toa_ip6->in6_addr),
                    ntohs(ptr_toa_ip6->port),
                    ptr_toa_ip6);
                TOA_INC_STATS(ext_stats, IP6_ADDR_ALLOC_CNT);
                return ptr_toa_ip6;
            }
#endif
            /* not a TOA option we can use: skip its payload */
            ptr += opsize - 2;
            length -= opsize;
        }
    }

    return NULL;
}
  1. 遍历所有 option, 根据 opcode 来处理 ipv4 或是 ipv6
  2. 将 toa struct 复制一份,然后返回

然后当 real server 调用 getpeername 或是 getsockname 时调用 inet_getname_toa 来获取 ip (只有 peer 非 0, 即 getpeername 时才会改写为 client 地址),如果是 ipv6 则调用 inet6_getname_toa

/*
 * Replacement for inet_getname: calls the original function and, when the
 * peer address was requested and the socket carries TOA data, overwrites
 * the address and port in @uaddr with the ones taken from the TOA option.
 *
 * @sock:      socket being queried.
 * @uaddr:     output address, filled by inet_getname and possibly rewritten.
 * @uaddr_len: passed through to inet_getname.
 * @peer:      non-zero when the peer address is requested.
 *
 * Returns the value returned by inet_getname.
 */
static int
inet_getname_toa(struct socket *sock, struct sockaddr *uaddr,
        int *uaddr_len, int peer)
{
    int retval = 0;
    struct sock *sk = sock->sk;
    struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
    struct toa_ip4_data tdata;

    TOA_DBG("inet_getname_toa called, sk->sk_user_data is %p\n",
        sk->sk_user_data);

    /* call orginal one */
    retval = inet_getname(sock, uaddr, uaddr_len, peer);

    /* set our value if need */
    if (retval == 0 && NULL != sk->sk_user_data && peer) {
        /* only sockets whose sk_data_ready is still the address looked
         * up for sock_def_readable are handled; others are bypassed */
        if (sk_data_ready_addr == (unsigned long) sk->sk_data_ready) {
            /* the option is stored in the pointer value itself, hence
             * the copy from the address of the field */
            memcpy(&tdata, &sk->sk_user_data, sizeof(tdata));
            if (TCPOPT_TOA == tdata.opcode &&
                TCPOLEN_IP4_TOA == tdata.opsize) {
                TOA_INC_STATS(ext_stats, GETNAME_TOA_OK_CNT);
                TOA_DBG("inet_getname_toa: set new sockaddr, ip "
                    TOA_NIPQUAD_FMT" -> "TOA_NIPQUAD_FMT
                    ", port %u -> %u\n",
                    TOA_NIPQUAD(sin->sin_addr.s_addr),
                    TOA_NIPQUAD(tdata.ip), ntohs(sin->sin_port),
                    ntohs(tdata.port));
                sin->sin_port = tdata.port;
                sin->sin_addr.s_addr = tdata.ip;
            } else { /* sk_user_data doesn't belong to us */
                TOA_INC_STATS(ext_stats,
                        GETNAME_TOA_MISMATCH_CNT);
                TOA_DBG("inet_getname_toa: invalid toa data, "
                    "ip "TOA_NIPQUAD_FMT" port %u opcode %u "
                    "opsize %u\n",
                    TOA_NIPQUAD(tdata.ip), ntohs(tdata.port),
                    tdata.opcode, tdata.opsize);
            }
        } else {
            TOA_INC_STATS(ext_stats, GETNAME_TOA_BYPASS_CNT);
        }
    } else { /* no need to get client ip */
        TOA_INC_STATS(ext_stats, GETNAME_TOA_EMPTY_CNT);
    }

    return retval;
}
  1. 调用原有 inet_getname 函数,获取 ip,兼容原有内核逻辑
  2. 判断 sk_user_data 不为空,并且结构体 op-kind op-length 与 ipv4 toa 的相等,获取 ip port ,并填充 sin

小结

实现原理还真简单,只不过有两个隐患。

  1. 如果 option 以后扩充其它内容,长度不够咋办?资源本身就不多
  2. op-kind 254 现在不被 tcp/ip 官方认可,以后会不会被占用?
赞(7) 打赏
转载请注明来源:IT技术资讯 » dpvs学习笔记: 12 TOA 实现原理

评论 抢沙发

评论前必须登录!

 

觉得文章有用就打赏一下文章作者

支付宝扫一扫打赏

微信扫一扫打赏