NCCL源码解析④:建图过程
作者|KIDGINBROOK
【资料图】
更新|潘丽晨
原文:https://blog.csdn.net/KIDGIN7439/article/details/127493629
上次分析了NCCL对机器PCI系统进行拓扑分析的过程,产出的结果为xml格式。接下来,NCCL会根据这个xml进行图的建立,以便之后进行路径搜索。
ncclTopoGetSystem的最后会执行ncclTopoGetSystemFromXml将xml格式转成图格式。
ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem) { NCCLCHECK(ncclCalloc(topoSystem, 1)); struct ncclXmlNode* topNode; NCCLCHECK(xmlFindTag(xml, "system", &topNode)); for (int s=0; s<topNode->nSubs; s++) { struct ncclXmlNode* node = topNode->subs[s]; if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem)); } NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL)); NCCLCHECK(ncclTopoConnectCpus(*topoSystem)); NCCLCHECK(ncclTopoSortSystem(*topoSystem)); return ncclSuccess;}
从xml中拿到根节点"system",然后遍历子节点中的"cpu",对每个cpu通过ncclTopoAddCpu进行建图,这里一个cpu其实就是一个numa。
ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* system) { int numaId; NCCLCHECK(xmlGetAttrInt(xmlCpu, "numaid", &numaId)); struct ncclTopoNode* cpu; NCCLCHECK(ncclTopoCreateNode(system, &cpu, CPU, numaId)); const char* str; NCCLCHECK(xmlGetAttr(xmlCpu, "affinity", &str)); if (str != NULL) { NCCLCHECK(ncclStrToCpuset(str, &cpu->cpu.affinity)); } NCCLCHECK(xmlGetAttrStr(xmlCpu, "arch", &str)); NCCLCHECK(kvConvertToInt(str, &cpu->cpu.arch, kvDictCpuArch)); if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86) { NCCLCHECK(xmlGetAttrStr(xmlCpu, "vendor", &str)); NCCLCHECK(kvConvertToInt(str, &cpu->cpu.vendor, kvDictCpuVendor)); if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { int familyId, modelId; NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId)); NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId)); cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW; } } for (int s=0; s<xmlCpu->nSubs; s++) { struct ncclXmlNode* node = xmlCpu->subs[s]; if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu)); if (strcmp(node->name, "nic") == 0) { struct ncclTopoNode* nic = NULL; NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, 0)); if (nic == NULL) { NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, 0)); NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_WIDTH)); NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_WIDTH)); } NCCLCHECK(ncclTopoAddNic(node, system, nic)); } } return ncclSuccess;}
接着创建一个cpu node,id为numaid,设置cpu的affinity,即该numa对应的核,设置cpu对应vendor等信息。
然后遍历cpu node的子节点,根据不同的类型执行不同的函数,如果是PCI节点,则执行ncclTopoAddPci。
ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent) { const char* str; int type; NCCLCHECK(xmlGetAttrStr(xmlPci, "class", &str)); NCCLCHECK(kvConvertToInt(str, &type, kvDictPciClass)); int64_t busId; NCCLCHECK(xmlGetAttrStr(xmlPci, "busid", &str)); NCCLCHECK(busIdToInt64(str, &busId)); struct ncclTopoNode* node = NULL; if (type == GPU) { struct ncclXmlNode* xmlGpu; NCCLCHECK(xmlGetSub(xmlPci, "gpu", &xmlGpu)); if (xmlGpu == NULL) return ncclSuccess; int index; NCCLCHECK(xmlGetAttrIndex(xmlGpu, "rank", &index)); if (index == -1) return ncclSuccess; NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId)); NCCLCHECK(ncclTopoAddGpu(xmlGpu, system, node)); } if (type == NIC) { struct ncclXmlNode* xmlNic; NCCLCHECK(xmlGetSub(xmlPci, "nic", &xmlNic)); if (xmlNic == NULL) return ncclSuccess; // Ignore sub device ID and merge multi-port NICs into one PCI device. busId &= 0xfffffffffffffff0; struct ncclTopoNode* nicNode = NULL; NCCLCHECK(ncclTopoGetNode(system, &nicNode, type, busId)); if (nicNode == NULL) { NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, busId)); node = nicNode; // Connect it to parent later on } NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode)); } else if (type == PCI) { NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId)); for (int s=0; s<xmlPci->nSubs; s++) { struct ncclXmlNode* xmlSubPci = xmlPci->subs[s]; NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node)); } } if (node) { int width, speed; NCCLCHECK(xmlGetAttrInt(xmlPci, "link_width", &width)); NCCLCHECK(xmlGetAttrStr(xmlPci, "link_speed", &str)); // Manage cases where speed was not indicated in /sys if (width == 0) width = 16; NCCLCHECK(kvConvertToInt(str, &speed, kvDictPciGen)); // Values in 100Mbps, per lane (we want GB/s in the end) NCCLCHECK(ncclTopoConnectNodes(node, parent, LINK_PCI, width*speed/80.0)); NCCLCHECK(ncclTopoConnectNodes(parent, node, LINK_PCI, width*speed/80.0)); } return ncclSuccess;}
首先获取pci的type和busId, 然后判断type,如果是PCI,那么创建一个PCI node,递归执行ncclTopoAddPci,直到遇到NIC或者GPU xml节点。
如果遇到的是NIC,那么创建NIC节点,然后执行ncclTopoAddNic,这里会在xml nic下遍历xml net,对每个xml net创建net node,id为dev,然后设置speed,port,gdr等属性。
ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* system, struct ncclTopoNode* nic) { int dev; NCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev)); struct ncclTopoNode* net; NCCLCHECK(ncclTopoCreateNode(system, &net, NET, dev)); const char* str; NCCLCHECK(xmlGetAttr(xmlNet, "guid", &str)); if (str) sscanf(str, "0x%lx", &net->net.asic); else net->net.asic = dev; ncclDebugNoWarn = NCCL_GRAPH; int mbps; if (xmlGetAttrInt(xmlNet, "speed", &mbps) != ncclSuccess) mbps = 0; if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1 net->net.width = mbps / 8000.0; if (xmlGetAttrInt(xmlNet, "port", &net->net.port) != ncclSuccess) net->net.port = 0; if (xmlGetAttrInt(xmlNet, "gdr", &net->net.gdrSupport) != ncclSuccess) net->net.gdrSupport = 0; if (xmlGetAttrInt(xmlNet, "maxconn", &net->net.maxChannels) != ncclSuccess) net->net.maxChannels = MAXCHANNELS; if (xmlGetAttrInt(xmlNet, "coll", &net->net.collSupport) != ncclSuccess) net->net.collSupport = 0; ncclDebugNoWarn = 0; NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.width)); NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.width)); return ncclSuccess;} ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* system, struct ncclTopoNode* nic) { for (int s=0; s<xmlNic->nSubs; s++) { struct ncclXmlNode* xmlNet = xmlNic->subs[s]; if (strcmp(xmlNet->name, "net") != 0) continue; int index; NCCLCHECK(xmlGetAttrIndex(xmlNet, "dev", &index)); if (index == -1) continue; NCCLCHECK(ncclTopoAddNet(xmlNet, system, nic)); } return ncclSuccess;}
然后通过ncclTopoConnectNodes建立net node与nic node之间的正反向边:设置边的类型,在边上累加带宽,并将当前节点的边按照带宽从大到小排序。
ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width) { // Aggregate links into higher width for NVLink struct ncclTopoLink* link; for (link = node->links; link->remNode; link++) { if (link->remNode == remNode && link->type == type) break; } if (link->remNode == NULL) node->nlinks++; link->type = type; link->remNode = remNode; link->width += width; // Sort links in BW descending order struct ncclTopoLink linkSave; memcpy(&linkSave, link, sizeof(struct ncclTopoLink)); while (link != node->links) { if ((link-1)->width >= linkSave.width) break; memcpy(link, link-1, sizeof(struct ncclTopoLink)); link--; } memcpy(link, &linkSave, sizeof(struct ncclTopoLink)); return ncclSuccess;}
到这里就添加完成了NIC,回到ncclTopoAddPci里,如果是gpu的话则创建gpu node,然后设置gpu node的rank,dev,gdr等属性。最后通过ncclTopoConnectNodes建立当前节点与其父节点之间的双向边。
到这里就完成了每个numa节点下的建图,然后开始添加nvlink和QPI连接,先看下nvlink。
ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId) { if (strcmp(node->name, "nvlink") == 0) { struct ncclTopoNode* gpu = NULL; int64_t pBusId; NCCLCHECK(busIdToInt64(parentBusId, &pBusId)); NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId)); if (gpu == NULL) { WARN("Add NVLink error : could not find GPU %lx\n", pBusId); return ncclInternalError; } int count; NCCLCHECK(xmlGetAttrInt(node, "count", &count)); const char* targetClass; NCCLCHECK(xmlGetAttrStr(node, "tclass", &targetClass)); int targetType; NCCLCHECK(kvConvertToInt(targetClass, &targetType, kvDictPciClass)); struct ncclTopoNode* remote = NULL; if (targetType == GPU) { // NVL P2P connection to another GPU const char* target; NCCLCHECK(xmlGetAttrStr(node, "target", &target)); int64_t busId; NCCLCHECK(busIdToInt64(target, &busId)); NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, busId)); } else if (targetType == CPU) { // NVL connection to the local CPU NCCLCHECK(findLocalCpu(gpu, &remote)); } else { if (system->nodes[NVS].count == 0) { NCCLCHECK(ncclTopoCreateNode(system, &remote, NVS, 0)); } else { remote = system->nodes[NVS].nodes; } } if (remote) { int nvlSpeed = gpu->gpu.cudaCompCap == 60 ? PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH; NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlSpeed)); if (remote->type != GPU) { NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlSpeed)); } } } else { const char* busId; NCCLCHECK(xmlGetAttr(node, "busid", &busId)); for (int s=0; s<node->nSubs; s++) { NCCLCHECK(ncclTopoAddNvLinks(node->subs[s], system, busId ? busId : parentBusId)); } } return ncclSuccess;}
从根节点递归遍历下去,直到遇到nvlink xml节点,然后拿到nvlink的父节点,即gpu节点,然后通过tclass获取对端PCI设备类型:如果是gpu或者cpu,直接查找并返回对端node;如果是nvswitch,那就在nvswitch节点尚不存在时先创建nvswitch节点。接着创建当前gpu节点到对端的边,当对端不是gpu时再补上对端到当前gpu的反向边(对端是gpu时,本次调用只添加正向边)。然后通过ncclTopoConnectCpus将cpu两两连接。
最后为了方便后续搜索channel,通过ncclTopoSort递归将每个PCI节点的边按照nvlink,向下的PCI连接,向上的PCI连接,QPI的顺序进行排序,因为建边的过程中已经按照带宽排序过,所以nvlink一定在最前边,QPI一定在最后,因此只需要对中间的PCI排序即可。
static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode* upNode) { // Shift all links to have upLink as last link if (upNode) { int l=0; while (node->links[l].remNode != upNode) l++; struct ncclTopoLink upLink; memcpy(&upLink, node->links+l, sizeof(struct ncclTopoLink)); while (node->links[l+1].remNode) { memcpy(node->links+l, node->links+l+1, sizeof(struct ncclTopoLink)); l++; } memcpy(node->links+l, &upLink, sizeof(struct ncclTopoLink)); } // Recursively sort the PCI tree for (int l=0; l<node->nlinks; l++) { struct ncclTopoLink* link = node->links+l; if (link->type == LINK_PCI && link->remNode != upNode) NCCLCHECK(ncclTopoSort(link->remNode, node)); } return ncclSuccess;}
到这里就完成了整个的建图过程。总结下,由于拓扑分析产出的xml不便于进行后续的路径搜索,所以本节基于xml对PCI系统进行了建图。
欢迎 Star、试用 OneFlow 最新版本:
https://github.com/Oneflow-Inc/oneflow/
关键词:
推荐阅读
月壤形成的主要原因 月壤与土壤有什么区别
月壤形成的主要原因月壤形成过程没有生物活动参与,没有有机质,还极度缺水干燥;组成月壤的矿物粉末基本是由陨石撞击破碎形成,因此,粉末 【详细】
域名抢注是什么意思?投资角度来看什么域名好?
域名抢注是什么意思域名抢注是通过抢先注册的方式获得互联网删除的域名的使用权。域名是由点分隔的一串数字,用于标记一台计算机或一组计 【详细】
捷达保养费用是多少?捷达是哪个国家的品牌?
捷达保养费用是多少?全新捷达的保修期为2年或6万公里,以先到者为准,新车可享受一次免费保养,首次免费保养在5000-7500km或1年内进行。如 【详细】
天然气泄露会造成爆炸吗?天然气泄漏怎么办?
天然气泄露会造成爆炸吗?家里用的天然气如果泄露是会发生爆炸的。当空气中含有混合天然气时,在与火源接触的一系列爆炸危险中,就会发生爆 【详细】
四部门明确App收集个人信息范围 个人信息保护范围判断标准
四部门明确App收集个人信息范围近日,国家互联网信息办公室、工业和信息化部、公安部、国家市场监督管理总局联合印发《常见类型移动互联网 【详细】
相关新闻
- NCCL源码解析④:建图过程
- 快讯:华为Mate60系列稳了,首发鸿蒙4.0,9月发布力抗iPhone15
- 宇航员连牺牲都不怕,为什么在月球上观察地球时却感到恐惧?
- 世界消息!工商银行筑梦中华金条100克价格今天多少一克(2023年07月04日)
- 每日快讯!富德生命 以案说险:发生风险莫慌张,资料齐全快速赔
- 新型铁电材料可变身机器人“肌肉” 全球快资讯
- 7月4日半导体概念板块涨幅达2%-环球关注
- 动态:公司销售毛利率上升 容大感光上半年净利预增98%-128%
- 全球热推荐:赋能产业,数智皮城如何领跑时尚新赛道
- 热讯:消息称iPhone15/15 Plus新增“青绿色” 采用磨砂玻璃材质
- VNL分站赛结束,但仍有两项女排比赛!国少冲冠、云南有邀请赛! 世界百事通
- 字节发布火山方舟:让大模型服务与应用像打车一样简单-环球关注
- 危机升级!荷兰宣布光刻机管制加码,中国芯片自主要走出新路
- 天天实时:若想5年内不换手机:建议“一步到位”,目前这5款手机符合要求
- 微信 iOS 版更新,带来了一些新变化|世界观焦点
- 警惕!“杀猪盘”蔓延威胁全球网络用户
- 长江上游古老航道焕发“绿色”新活力
- 环球消息!我国多种矿产勘查开发实现新突破
- 环球观热点:2023呼和浩特市小升初摇号时间+结果查询入口
- 环球讯息:以桂林命名的这种植物美出新高度