Skip to content

Examples

These examples demonstrate how to go from describing a variety of devices and infrastructures with text descriptions and diagrams to defining them using a standardized schema.

DGX-A100 Server

This server diagram is an example of how multiple components can be connected to a single component, such as multiple GPU components connected to a single PCIe switch.

The graph model is able to capture the asymmetric layout of the device.

Description

dgxa100

Standardized Definition

DGX device definition using OpenApiArt generated classes
from typing import Optional
from infragraph import *

# pyright: reportArgumentType=false


class Dgx(Device):
    """InfraGraph device definition of an Nvidia DGX system.

    The definition is built entirely in the constructor: it registers the
    device's components, the link types between them, and the edges that
    wire the components together.
    """

    def __init__(self, nic_device: Optional[Device] = None):
        """Define the components, links and edges of the DGX device.

        Components:
        - 2 cpus
        - 8 npus
        - 4 pcie switches
        - 8 nics
        - 1 nvlink switch

        Edges:
        - cpu <-> cpu over the "fabric" link
        - npu <-> nvlink switch over the "nvlink" link
        - one pair of npus and one pair of nics per pcie switch over the
          "pcie" link

        Args:
            nic_device: Optional device to use for the nics. When given, the
                nic component takes its name and description from this device
                and is marked as ``Component.DEVICE``. When omitted, a generic
                ``Component.NIC`` component named "nic" is created instead.
        """
        # Zero-argument super() so that Device.__init__ itself runs;
        # super(Device, self) would start the MRO lookup *after* Device and
        # skip its initializer.
        super().__init__()
        self.name = "dgx"
        self.description = "Nvidia DGX System"

        # --- components -----------------------------------------------------
        cpu = self.components.add(
            name="cpu",
            description="AMD Epyc 7742 CPU",
            count=2,
        )
        cpu.choice = Component.CPU
        npu = self.components.add(
            name="npu",
            description="Nvidia A100 GPU",
            count=8,
        )
        npu.choice = Component.NPU
        nvlsw = self.components.add(
            name="nvlsw",
            description="NVLink Switch",
            count=1,
        )
        nvlsw.choice = Component.CUSTOM
        pciesw = self.components.add(
            name="pciesw",
            description="PCI Express Switch Gen 4",
            count=4,
        )
        pciesw.choice = Component.CUSTOM
        if nic_device is None:
            nic = self.components.add(
                name="nic",
                description="Generic Nic",
                count=8,
            )
            nic.choice = Component.NIC
        else:
            # The nic is itself a device: reference it by name so edges below
            # address the caller-supplied device rather than a generic nic.
            nic = self.components.add(
                name=nic_device.name,
                description=nic_device.description,
                count=8,
            )
            nic.choice = Component.DEVICE

        # --- links ----------------------------------------------------------
        cpu_fabric = self.links.add(name="fabric", description="AMD Infinity Fabric")
        pcie = self.links.add(name="pcie")
        nvlink = self.links.add(name="nvlink")

        # --- edges ----------------------------------------------------------
        edge = self.edges.add(scheme=DeviceEdge.MANY2MANY, link=cpu_fabric.name)
        edge.ep1.component = cpu.name
        edge.ep2.component = cpu.name

        edge = self.edges.add(scheme=DeviceEdge.MANY2MANY, link=nvlink.name)
        edge.ep1.component = npu.name
        edge.ep2.component = nvlsw.name

        # Each pcie switch serves one pair of npus and one pair of nics.
        # All npu edges are added before the nic edges so the edge order in
        # the serialized definition stays the same.
        pair_slices = ["0:2", "2:4", "4:6", "6:8"]
        for endpoint in (npu, nic):
            for pair_slice, pciesw_idx in zip(pair_slices, range(pciesw.count)):
                edge = self.edges.add(scheme=DeviceEdge.MANY2MANY, link=pcie.name)
                edge.ep1.component = f"{endpoint.name}[{pair_slice}]"
                edge.ep2.component = f"{pciesw.name}[{pciesw_idx}]"


if __name__ == "__main__":
    # Build the default DGX definition and dump it to stdout as YAML.
    print(Dgx().serialize(encoding=Device.YAML))
DGX device definition as yaml
components:
- choice: cpu
  count: 2
  description: AMD Epyc 7742 CPU
  name: cpu
- choice: npu
  count: 8
  description: Nvidia A100 GPU
  name: npu
- choice: custom
  count: 1
  description: NVLink Switch
  name: nvlsw
- choice: custom
  count: 4
  description: PCI Express Switch Gen 4
  name: pciesw
- choice: nic
  count: 8
  description: Generic Nic
  name: nic
description: Nvidia DGX System
edges:
- ep1:
    component: cpu
  ep2:
    component: cpu
  link: fabric
  scheme: many2many
- ep1:
    component: npu
  ep2:
    component: nvlsw
  link: nvlink
  scheme: many2many
- ep1:
    component: npu[0:2]
  ep2:
    component: pciesw[0]
  link: pcie
  scheme: many2many
- ep1:
    component: npu[2:4]
  ep2:
    component: pciesw[1]
  link: pcie
  scheme: many2many
- ep1:
    component: npu[4:6]
  ep2:
    component: pciesw[2]
  link: pcie
  scheme: many2many
- ep1:
    component: npu[6:8]
  ep2:
    component: pciesw[3]
  link: pcie
  scheme: many2many
- ep1:
    component: nic[0:2]
  ep2:
    component: pciesw[0]
  link: pcie
  scheme: many2many
- ep1:
    component: nic[2:4]
  ep2:
    component: pciesw[1]
  link: pcie
  scheme: many2many
- ep1:
    component: nic[4:6]
  ep2:
    component: pciesw[2]
  link: pcie
  scheme: many2many
- ep1:
    component: nic[6:8]
  ep2:
    component: pciesw[3]
  link: pcie
  scheme: many2many
links:
- description: AMD Infinity Fabric
  name: fabric
- name: pcie
- name: nvlink
name: dgx

GH200-MGX

Description

NPU Component

spine and leaf

Device

spine and leaf

Standardized Definition

GH200-MGX device definition using OpenApiArt generated classes
TBD...

ScaleUp/ScaleOut Infrastructure

Description

ai-ml-hpc-datacenter-networks https://mips.com/blog/reimagining-ai-infrastructure-the-power-of-converged-back-end-networks/

  • 1024 hosts
    • 1 npu/host
    • 10 nics/host
  • 512 scaleup switches
    • 16 ports/switch
  • 2 scaleout switches
    • 1024 ports/switch
  • 64 racks
    • 16 hosts/rack
    • 8 scaleup switches/rack

ai-ml-hpc-datacenter-networks

Standardized Definition

ScaleUp/ScaleOut infrastructure definition using OpenApiArt generated classes
TBD...