
Commit 528554f

first commit
0 parents  commit 528554f

37 files changed, +3061 −0 lines changed

.github/workflows/ci.yaml

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
name: ci

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

concurrency:
  group: ci-${{github.ref}}-${{github.event.pull_request.number || github.run_number}}
  cancel-in-progress: true

jobs:
  formatting:
    uses: ./.github/workflows/formatting.yaml
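Note that ci.yaml delegates its single job to formatting.yaml as a reusable workflow (via workflow_call), so the same formatting check can also be triggered manually through workflow_dispatch.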

.github/workflows/formatting.yaml

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
name: formatting tests

on:
  workflow_dispatch:
  workflow_call:

concurrency:
  group: unit_tests-${{github.ref}}-${{github.event.pull_request.number || github.run_number}}
  cancel-in-progress: true

jobs:
  formatting:
    runs-on: [ubuntu-latest]

    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.9
        uses: actions/setup-python@v2
        with:
          python-version: 3.9

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip

      - name: Update black
        run: |
          pip install --upgrade black

      - name: Lint and Format Check with black
        run: |
          black --diff --check .
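To reproduce the check locally, the workflow's steps amount to `pip install --upgrade black` followed by `black --diff --check .` from the repository root.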

README.md

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
# Training GNNs with AxoNN

## Directory Structure

- **main**: Contains the parallel implementation and the core code for training the model.
- **scripts**: Contains all shell scripts to run experiments and benchmarks. This is where you can find the scripts to set up and execute various experiments.
- **results**: The output files of experiments are stored here, along with plotting scripts to visualize the results.
- **validation**: Contains baselines used for comparison and validation purposes.
- **performance**: Holds the code for performance modeling and benchmarking.

benchmarking/plot.sh

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
#!/bin/bash

# Loop through all directories in the current folder
for gpu_dir in */; do
    gpu_dir=${gpu_dir%/}  # Remove trailing slash

    # Check if it is a directory
    if [ -d "$gpu_dir" ]; then

        # Loop through subdirectories 0, 1, and 2
        for sub_dir in 0 1 2; do
            dir="./$gpu_dir/$sub_dir"

            # Check if the subdirectory exists
            if [ -d "$dir" ]; then
                # Run the Python script inside the subdirectory
                (cd "$dir" && python ../../process_comm_model.py)

                # Copy and rename times.npy to the top-level directory
                npy_file="$dir/times.npy"
                if [ -f "$npy_file" ]; then
                    cp "$npy_file" "./times_${gpu_dir}_${sub_dir}.npy"
                    echo "Copied and renamed $npy_file to ./times_${gpu_dir}_${sub_dir}.npy"
                fi
            else
                echo "Subdirectory $dir not found, skipping."
            fi
        done
    fi
done

# Run the final plotting script
python plot_comm_model_avg.py
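For reference, the script appears to assume a layout of <gpu-count>/<run>/ directories (runs 0-2), where each run directory holds the log files that process_comm_model.py parses and where that script leaves a times.npy; the copied times_<gpu-count>_<run>.npy files are what the aggregation script below averages per GPU count.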
benchmarking/plot_comm_model_avg.py

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
import matplotlib.pyplot as plt
import numpy as np
import os


def aggregate_npy_data(directory):
    num_gpus = [4, 8, 16, 32, 64, 128]

    mean_comm_times, std_comm_times, mean_epoch_times, std_epoch_times = [], [], [], []

    # Read all .npy files in the directory
    for i in range(len(num_gpus)):
        comm_times_list, epoch_times_list = [], []
        for file in os.listdir(directory):
            if "_" + str(num_gpus[i]) + "_" in file and file.endswith(".npy"):
                data = np.load(os.path.join(directory, file), allow_pickle=True)
                if len(data) >= 2:
                    comm_times, epoch_times = data
                    comm_times_list.append(comm_times)
                    epoch_times_list.append(epoch_times)
        mean_comm_times.append(np.mean(np.array(comm_times_list), axis=0))
        std_comm_times.append(np.std(np.array(comm_times_list), axis=0))
        mean_epoch_times.append(np.mean(np.array(epoch_times_list), axis=0))
        # Use epoch_times_list here (not comm_times_list, which would report
        # the communication-time spread as the epoch-time spread).
        std_epoch_times.append(np.std(np.array(epoch_times_list), axis=0))

    print((np.array(mean_epoch_times) - np.array(mean_comm_times)).flatten().tolist())

    np.save(
        "scaling_perlmutter_reddit.npy",
        (mean_comm_times, std_comm_times, mean_epoch_times, std_epoch_times),
    )


aggregate_npy_data(os.getcwd())
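
For reference, a minimal sketch of reading the saved aggregate back (assuming the file was produced by the np.save call above); the tuple layout and ordering mirror the script's num_gpus list:

import numpy as np

# Unpack the four per-GPU-count lists saved by aggregate_npy_data().
mean_comm, std_comm, mean_epoch, std_epoch = np.load(
    "scaling_perlmutter_reddit.npy", allow_pickle=True
)

# One entry per GPU count, ordered as [4, 8, 16, 32, 64, 128].
for gpus, comm, epoch in zip([4, 8, 16, 32, 64, 128], mean_comm, mean_epoch):
    print(f"{gpus} GPUs: mean comm times {comm}, mean epoch times {epoch}")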

benchmarking/plot_val_text.py

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
import os
import re
import matplotlib.pyplot as plt

# Directory containing the files
directory = "./"  # Change if needed

# Pattern to extract config name and train loss values
file_pattern = re.compile(r"products_(.+)\.txt")
loss_pattern = re.compile(r"Epoch: \d+, Train Loss: ([\d\.]+)")

# Dictionary to store losses per config
losses_dict = {}

# Iterate over all files in the directory
for filename in os.listdir(directory):
    match = file_pattern.match(filename)
    if match:
        config_name = match.group(1)
        losses = []

        # Read the file and extract losses
        with open(os.path.join(directory, filename), "r") as file:
            for line in file:
                loss_match = loss_pattern.search(line)
                if loss_match:
                    losses.append(float(loss_match.group(1)))

        # Store the extracted losses
        if losses:
            losses_dict[config_name] = losses

# Plot the losses
plt.figure(figsize=(10, 6))
for config, losses in losses_dict.items():
    plt.plot(losses, label=config)

plt.xlabel("Epochs")
plt.ylabel("Train Loss")
plt.title("Training Loss per Configuration")
plt.legend()
plt.grid(True)

plt.savefig("val.png")
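
As a sanity check, the two regexes expect file names like products_<config>.txt and loss lines in the format printed by pyg_serial.py below; the sample values here are hypothetical:

import re

file_pattern = re.compile(r"products_(.+)\.txt")
loss_pattern = re.compile(r"Epoch: \d+, Train Loss: ([\d\.]+)")

print(file_pattern.match("products_X2Y2Z1.txt").group(1))  # -> "X2Y2Z1"
print(loss_pattern.search("Epoch: 003, Train Loss: 1.9812").group(1))  # -> "1.9812"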

benchmarking/process_comm_model.py

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
import re
import os
import numpy as np
import matplotlib.pyplot as plt
from comm_model import compute_config_costs
from comp_model import comp_model


def extract_avg_time(line):
    match = re.search(r"Avg Time: ([0-9]*\.?[0-9]+)", line)
    return float(match.group(1)) if match else 0


def process_log_file(filename):
    comm_times, epoch_times = [], []
    comm_time, comp_time, cross_time = None, None, None

    with open(filename, "r") as file:
        for line in file:
            line = line.strip()

            if (
                "epoch " in line
                and comm_time is not None
                and comp_time is not None
                and cross_time is not None
            ):
                epoch_times.append(comp_time + comm_time + cross_time)
                comm_times.append(comm_time)
                comm_time = 0
                comp_time = 0
                cross_time = 0
            elif "epoch " in line:
                comm_time = 0
                comp_time = 0
                cross_time = 0
            elif comm_time is not None and any(
                keyword in line
                for keyword in ["gather ", "all-reduce ", "reduce-scatter "]
            ):
                comm_time += extract_avg_time(line)
            elif comp_time is not None and any(
                keyword in line
                for keyword in [
                    "AGG = A * H ",
                    # "OUT = AGG * W ",
                    # "GRAD_W = AGG.T * GRAD_OUT ",
                    # "GRAD_AGG = GRAD_OUT * W.T ",
                    "GRAD_H = A.T * GRAD_AGG ",
                ]
            ):
                comp_time += extract_avg_time(line)
            elif cross_time is not None and any(
                keyword in line for keyword in ["cross entropy"]
            ):
                cross_time += extract_avg_time(line)

    if comm_time is not None and comp_time is not None and cross_time is not None:
        # epoch_times.append(comp_time + comm_time + cross_time)
        epoch_times.append(comp_time + comm_time)
        comm_times.append(comm_time)

    return sum(epoch_times[1:]) / (len(epoch_times) - 1), sum(comm_times[1:]) / (
        len(comm_times) - 1
    )


def parse_config(filename):
    match = re.search(r"reddit_X(\d+)Y(\d+)Z(\d+)\.txt", filename)
    if match:
        x, y, z = map(int, match.groups())
        return (x, y, z)
    return None


def main():
    num_configs = len([f for f in os.listdir() if f.endswith(".txt")])

    epoch_times = [0] * num_configs
    comm_times = [0] * num_configs

    num_gpus = None
    for filename in os.listdir():
        if filename.startswith("reddit_") and filename.endswith(".txt"):
            config = parse_config(filename)
            num_gpus = config[0] * config[1] * config[2]

            """
            CONFIG_RANKS = compute_config_costs(
                num_gpus, 232965, [602, 128, 128, 41], "v3", "perlmutter"
            )
            """

            CONFIG_RANKS = comp_model(232965, 114848857, num_gpus, [602, 128, 128])

            sorted_items = sorted(CONFIG_RANKS.items(), key=lambda x: x[1])

            for i in range(len(sorted_items)):
                CONFIG_RANKS[sorted_items[i][0]] = i

            if config and config in CONFIG_RANKS:
                rank = CONFIG_RANKS[config]
                avg_epoch_time, avg_comm_time = process_log_file(filename)

                if avg_comm_time > 0 and avg_epoch_time > 0:
                    rank = 0
                    comm_times[rank] = avg_comm_time
                    epoch_times[rank] = avg_epoch_time

    x_ticks = list(range(len(CONFIG_RANKS)))
    x_labels = list(range(len(CONFIG_RANKS)))

    np.save("times", (comm_times, epoch_times))


if __name__ == "__main__":
    main()
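
For orientation, a small sketch of the inputs this parser assumes: per-configuration log files named reddit_X{x}Y{y}Z{z}.txt (X, Y, Z being the process-grid dimensions whose product main() uses as the GPU count), with timer lines ending in "Avg Time: <seconds>". The sample strings below are hypothetical:

import re

# Hypothetical file name: an X=2, Y=4, Z=1 configuration, i.e. 8 GPUs.
print(re.search(r"reddit_X(\d+)Y(\d+)Z(\d+)\.txt", "reddit_X2Y4Z1.txt").groups())
# -> ('2', '4', '1')

# Hypothetical timer line; it contains "all-reduce ", so its value is
# accumulated into comm_time by process_log_file().
print(re.search(r"Avg Time: ([0-9]*\.?[0-9]+)", "all-reduce  Avg Time: 0.0123").group(1))
# -> '0.0123'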

benchmarking/pyg_serial.py

Lines changed: 108 additions & 0 deletions
@@ -0,0 +1,108 @@
import os
import math
import torch
import random
import argparse
import numpy as np
import torch.nn.functional as F
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv
from torch_geometric.datasets import Reddit
from torch_geometric.data.storage import GlobalStorage
from torch_geometric.data.data import DataEdgeAttr, DataTensorAttr


torch.serialization.add_safe_globals([GlobalStorage, DataEdgeAttr, DataTensorAttr])


def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def create_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--download_path", type=str)
    parser.add_argument("--num_epochs", type=int, default=2)
    return parser


def get_dataset(download_path=None):
    # dataset = Reddit(download_path, transform=T.NormalizeFeatures())
    # dataset = PygNodePropPredDataset(name="ogbn-products", root=input_dir, transform=T.NormalizeFeatures())
    # gcn_norm = T.GCNNorm()
    # return (gcn_norm.forward(dataset[0]), dataset.num_classes)
    return torch.load(download_path)


class Net(torch.nn.Module):
    def __init__(self, num_input_features, num_classes):
        super(Net, self).__init__()

        self.conv1 = GCNConv(num_input_features, 128, normalize=False, bias=False)
        self.conv2 = GCNConv(128, 128, normalize=False, bias=False)
        self.conv3 = GCNConv(128, num_classes, normalize=False, bias=False)

        torch.nn.init.kaiming_uniform_(self.conv1.lin.weight, a=math.sqrt(5))
        torch.nn.init.kaiming_uniform_(self.conv2.lin.weight, a=math.sqrt(5))
        torch.nn.init.kaiming_uniform_(self.conv3.lin.weight, a=math.sqrt(5))

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.conv3(x, edge_index)
        return x


def train(model, optimizer, input_features, adj, labels):
    model.train()

    optimizer.zero_grad()

    output = model(input_features, adj)

    loss = F.cross_entropy(output, labels)

    loss.backward()

    optimizer.step()

    return loss


if __name__ == "__main__":
    parser = create_parser()
    args = parser.parse_args()
    set_seed(args.seed)

    data, num_classes = get_dataset(args.download_path)
    num_input_features = data.x.shape[1]

    data.y = data.y.type(torch.LongTensor)
    data.y = data.y.to(torch.device("cuda"))

    features_local = data.x.to(torch.device("cuda")).requires_grad_()

    model = Net(num_input_features, num_classes).to(torch.device("cuda"))

    optimizer = torch.optim.AdamW(
        list(model.parameters()) + [features_local], lr=3e-3, weight_decay=0
    )

    adj = torch.sparse_coo_tensor(
        data.edge_index, data.edge_weight, (data.x.shape[0], data.x.shape[0])
    )
    adj = adj.to_sparse_csr()
    adj = adj.to(torch.device("cuda"))

    losses = []
    for i in range(args.num_epochs):
        loss = train(model, optimizer, features_local, adj, data.y)
        losses.append(loss.item())
        log = "Epoch: {:03d}, Train Loss: {:.4f}"
        print(log.format(i, loss))
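
The script expects --download_path to point at a preprocessed dataset saved as a (data, num_classes) tuple, with normalized features and GCN edge weights already attached (it reads data.edge_weight when building the sparse adjacency). A minimal sketch of producing such a file for Reddit, following the commented-out lines in get_dataset(); the output path below is hypothetical:

import torch
import torch_geometric.transforms as T
from torch_geometric.datasets import Reddit

# Download Reddit, normalize features, and precompute GCN-normalized
# edge weights so that data.edge_weight is populated.
dataset = Reddit("./reddit", transform=T.NormalizeFeatures())
data = T.GCNNorm().forward(dataset[0])

# Save the (data, num_classes) tuple that get_dataset() torch.load()s.
torch.save((data, dataset.num_classes), "./reddit_preprocessed.pt")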
