cesmix-mit · emmanuellujan · Jun 17, 2024 · Jun 14, 2024 · Jun 14, 2024 · Jun 14, 2024
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 ## [WIP] PotentialLearning.jl
 
-Developing Optimization Workflows for Fast and Accurate Interatomic Potentials. This package is part of a software suite developed for the [CESMIX](https://computing.mit.edu/cesmix/) project.
+Composable Optimization Workflows for Fast and Accurate Interatomic Potentials. This package is part of a software suite developed for the [CESMIX](https://computing.mit.edu/cesmix/) project.
 
 <!--<a href="https://cesmix-mit.github.io/PotentialLearning.jl/stable">
 <img alt="Stable documentation" src="https://img.shields.io/badge/documentation-stable%20release-blue?style=flat-square">

diff --git a/docs/Project.toml b/docs/Project.toml
@@ -3,6 +3,7 @@ AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a"
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Determinantal = "2673d5e8-682c-11e9-2dfd-471b09c6c819"
+DisplayAs = "0b91fe84-8a4c-11e9-3e1d-67c38462b6d6"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244"

diff --git a/docs/make.jl b/docs/make.jl
@@ -22,6 +22,7 @@ const OUTPUT_DIR   = joinpath(@__DIR__, "src/generated")
 
 examples = [
     "Fit a-HfO2 dataset with ACE" => "ACE-aHfO2/fit-ace-aHfO2.jl",
+    "Subsample a-HfO2 dataset with DPP and fit with ACE" => "DPP-ACE-aHfO2-1/fit-dpp-ace-ahfo2.jl",
     "Subsample Na dataset with DPP and fit with ACE" => "DPP-ACE-Na/fit-dpp-ace-na.jl",
     "Subsample Si dataset with DPP, fit with ACE, and cross validate" => "DPP-ACE-Si/fit-dpp-ace-si.jl",
     "Load Ar+Lennard-Jones dataset and postprocess" => "LJ-Ar/lennard-jones-ar.jl"

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -1,6 +1,7 @@
 # [WIP] PotentialLearning.jl
 
-**Developing optimization workflows for fast and accurate interatomic potentials**. This package is part of a software suite developed for the [CESMIX](https://computing.mit.edu/cesmix/) project.
+**Composable optimization workflows for fast and accurate interatomic potentials**. This package is part of a software suite developed for the [CESMIX](https://computing.mit.edu/cesmix/) project.
+
 
 ## Goals
 

diff --git a/examples/ACE-aHfO2/Project.toml b/examples/ACE-aHfO2/Project.toml
@@ -1,6 +1,7 @@
 [deps]
 AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a"
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+DisplayAs = "0b91fe84-8a4c-11e9-3e1d-67c38462b6d6"
 InteratomicPotentials = "a9efe35a-c65d-452d-b8a8-82646cd5cb04"
 IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

diff --git a/examples/ACE-aHfO2/fit-ace-aHfO2.jl b/examples/ACE-aHfO2/fit-ace-aHfO2.jl
@@ -1,88 +1,98 @@
+# # Fit a-HfO2 dataset with ACE
+
+# ## Load packages, define paths, and create experiment folder.
+
+# Load packages
 using AtomsBase, InteratomicPotentials, PotentialLearning
 using Unitful, UnitfulAtomic
-using LinearAlgebra, Random
+using LinearAlgebra, Random, DisplayAs
 
+# Define paths.
 path = joinpath(dirname(pathof(PotentialLearning)), "../examples/ACE-aHfO2")
+ds_path =  "$path/../data/a-HfO2/a-HfO2-300K-NVT-6000.extxyz"
+res_path = "$path/results/"
 
+# Load utility functions.
 include("$path/../utils/utils.jl")
 
-# Setup experiment
-
-# Experiment folder
-path = "$path/results/"
-run(`mkdir -p $path`)
+# Create experiment folder.
+run(`mkdir -p $res_path`)
 
-# Define training and test configuration datasets
+# ## Load atomistic dataset and split it into training and test.
 
-# Load complete configuration dataset
-ds_path = string("$path/../../data/a-HfO2/a-HfO2-300K-NVT-6000.extxyz")
+# Load atomistic dataset: atomistic configurations (atom positions, geometry, etc.) + DFT data (energies, forces, etc.)
 ds = load_data(ds_path, uparse("eV"), uparse("Å"))
 
-# Split configuration dataset into training and test
-n_train, n_test = 50, 50
+# Split atomistic dataset into training and test
+n_train, n_test = 50, 50 # only 50 samples per dataset are used in this example.
 conf_train, conf_test = split(ds, n_train, n_test)
 
-# Define IAP model
+# ## Create ACE basis, compute descriptors and add them to the dataset.
 
-# Define ACE basis
+# Create ACE basis
 basis = ACE(species           = [:Hf, :O],
             body_order        = 3,
             polynomial_degree = 3,
             rcutoff           = 5.0,
             wL                = 1.0,
             csp               = 1.0,
             r0                = 1.0)
-@save_var path basis
+@save_var res_path basis
 
-# Update training dataset by adding energy and force descriptors
+# Compute ACE descriptors for energy and forces based on the atomistic training configurations.
 println("Computing energy descriptors of training dataset...")
-B_time = @elapsed e_descr_train = compute_local_descriptors(conf_train, basis)
+e_descr_train = compute_local_descriptors(conf_train, basis;
+                                          pbar=false)
 println("Computing force descriptors of training dataset...")
-dB_time = @elapsed f_descr_train = compute_force_descriptors(conf_train, basis)
-GC.gc()
+f_descr_train = compute_force_descriptors(conf_train, basis;
+                                          pbar=false)
+
+# Update training dataset by adding energy and force descriptors.
 ds_train = DataSet(conf_train .+ e_descr_train .+ f_descr_train)
 
-# Learn
+# ## Learn ACE coefficients based on ACE descriptors and DFT data.
 println("Learning energies and forces...")
 lb = LBasisPotential(basis)
 ws, int = [1.0, 1.0], false
 learn!(lb, ds_train, ws, int)
-
-@save_var path lb.β
-@save_var path lb.β0
+@save_var res_path lb.β
+@save_var res_path lb.β0
 lb.β, lb.β0
 
-# Post-process output: calculate metrics, create plots, and save results
+# ## Post-process output: calculate metrics, create plots, and save results.
 
-# Update test dataset by adding energy and force descriptors
+# Compute ACE descriptors for energy and forces based on the atomistic test configurations.
 println("Computing energy descriptors of test dataset...")
-e_descr_test = compute_local_descriptors(conf_test, basis)
+e_descr_test = compute_local_descriptors(conf_test, basis;
+                                         pbar = false)
 println("Computing force descriptors of test dataset...")
-f_descr_test = compute_force_descriptors(conf_test, basis)
-GC.gc()
+f_descr_test = compute_force_descriptors(conf_test, basis;
+                                         pbar = false)
+
+# Update test dataset by adding energy and force descriptors.
 ds_test = DataSet(conf_test .+ e_descr_test .+ f_descr_test)
 
-# Get true and predicted values
+# Get true and predicted values for energies and forces.
 n_atoms_train = length.(get_system.(ds_train))
 n_atoms_test = length.(get_system.(ds_test))
 
 e_train, e_train_pred = get_all_energies(ds_train) ./ n_atoms_train,
                         get_all_energies(ds_train, lb) ./ n_atoms_train
 f_train, f_train_pred = get_all_forces(ds_train),
                         get_all_forces(ds_train, lb)
-@save_var path e_train
-@save_var path e_train_pred
-@save_var path f_train
-@save_var path f_train_pred
+@save_var res_path e_train
+@save_var res_path e_train_pred
+@save_var res_path f_train
+@save_var res_path f_train_pred
 
 e_test, e_test_pred = get_all_energies(ds_test) ./ n_atoms_test,
                       get_all_energies(ds_test, lb) ./ n_atoms_test
 f_test, f_test_pred = get_all_forces(ds_test),
                       get_all_forces(ds_test, lb)
-@save_var path e_test
-@save_var path e_test_pred
-@save_var path f_test
-@save_var path f_test_pred
+@save_var res_path e_test
+@save_var res_path e_test_pred
+@save_var res_path f_test
+@save_var res_path f_test_pred
 
 # Compute training metrics
 e_train_metrics = get_metrics(e_train, e_train_pred,
@@ -92,7 +102,7 @@ f_train_metrics = get_metrics(f_train, f_train_pred,
                               metrics = [mae, rmse, rsq, mean_cos],
                               label = "f_train")
 train_metrics = merge(e_train_metrics, f_train_metrics)
-@save_dict path train_metrics
+@save_dict res_path train_metrics
 train_metrics
 
 # Compute test metrics
@@ -103,36 +113,36 @@ f_test_metrics = get_metrics(f_test, f_test_pred,
                              metrics = [mae, rmse, rsq, mean_cos],
                              label = "f_test")
 test_metrics = merge(e_test_metrics, f_test_metrics)
-@save_dict path test_metrics
+@save_dict res_path test_metrics
 test_metrics
 
 # Plot and save energy results
 e_plot = plot_energy(e_train, e_train_pred,
                      e_test, e_test_pred)
-@save_fig path e_plot
-e_plot
+@save_fig res_path e_plot
+DisplayAs.PNG(e_plot)
 
 # Plot and save force results
 f_plot = plot_forces(f_train, f_train_pred,
                      f_test, f_test_pred)
-@save_fig path f_plot
-f_plot
+@save_fig res_path f_plot
+DisplayAs.PNG(f_plot)
 
 # Plot and save training force cosine
 e_train_plot = plot_energy(e_train, e_train_pred)
 f_train_plot = plot_forces(f_train, f_train_pred)
 f_train_cos  = plot_cos(f_train, f_train_pred)
-@save_fig path e_train_plot
-@save_fig path f_train_plot
-@save_fig path f_train_cos
-f_train_cos
+@save_fig res_path e_train_plot
+@save_fig res_path f_train_plot
+@save_fig res_path f_train_cos
+DisplayAs.PNG(f_train_cos)
 
 # Plot and save test force cosine
 e_test_plot = plot_energy(e_test, e_test_pred)
 f_test_plot = plot_forces(f_test, f_test_pred)
 f_test_cos  = plot_cos(f_test, f_test_pred)
-@save_fig path e_test_plot
-@save_fig path f_test_plot
-@save_fig path f_test_cos
-f_test_cos
+@save_fig res_path e_test_plot
+@save_fig res_path f_test_plot
+@save_fig res_path f_test_cos
+DisplayAs.PNG(f_test_cos)
 
diff --git a/examples/DPP-ACE-Na/fit-dpp-ace-na.jl b/examples/DPP-ACE-Na/fit-dpp-ace-na.jl
@@ -1,16 +1,28 @@
+# # Subsample Na dataset with DPP and fit energies with ACE
+
+# ## Load packages and define paths.
+
+# Load packages
 using Unitful, UnitfulAtomic
 using AtomsBase, InteratomicPotentials, PotentialLearning
 using LinearAlgebra, Plots
 
-# Load dataset
+# Define paths.
 path = joinpath(dirname(pathof(PotentialLearning)), "../examples/DPP-ACE-Na")
-confs, thermo = load_data("$path/../data/Na/liquify_sodium.yaml", YAML(:Na, u"eV", u"Å"))
+ds_path = "$path/../data/Na/liquify_sodium.yaml"
+
+# ## Load atomistic dataset and split it into training and test.
+
+# Load atomistic dataset: atomistic configurations (atom positions, geometry, etc.) + DFT data (energies, forces, etc.).
+confs, thermo = load_data(ds_path, YAML(:Na, u"eV", u"Å"))
 confs, thermo = confs[220:end], thermo[220:end]
 
-# Split dataset
+# Split atomistic dataset into training and test.
 conf_train, conf_test = confs[1:1000], confs[1001:end]
 
-# Define ACE
+# ## Create ACE basis, compute energy descriptors and add them to the dataset.
+
+# Create ACE basis.
 ace = ACE(species = [:Na],         # species
           body_order = 4,          # 4-body
           polynomial_degree = 8,   # 8 degree polynomials
@@ -19,39 +31,46 @@ ace = ACE(species = [:Na],         # species
           r0 = 1.0,                # minimum distance between atoms
           rcutoff = 5.0)           # cutoff radius 
 
-# Update training dataset by adding energy (local) descriptors
+# Update training dataset by adding energy (local) descriptors.
 println("Computing local descriptors of training dataset")
-e_descr_train = compute_local_descriptors(conf_train, ace)
-#e_descr_train = JLD.load("data/sodium_empirical_full.jld", "descriptors")
+e_descr_train = compute_local_descriptors(conf_train, ace) # JLD.load("data/sodium_empirical_full.jld", "descriptors")
+
+# Update training dataset by adding energy and force descriptors.
 ds_train = DataSet(conf_train .+ e_descr_train)
 
-# Learn using DPP
-lb = LBasisPotential(ace)
+# ## Subsampling via DPP.
+
+# Create DPP subselector.
 dpp = kDPP(ds_train, GlobalMean(), DotProduct(); batch_size = 200)
+
+# Subsample trainig dataset.
 dpp_inds = get_random_subset(dpp)
+
+# ## Learn ACE coefficients based on ACE descriptors and DFT data.
+lb = LBasisPotential(ace)
 α = 1e-8
 Σ = learn!(lb, ds_train[dpp_inds], α)
 
-# Post-process output
+# ## Post-process output: calculate metrics, create plots, and save results.
 
-# Update test dataset by adding energy and force descriptors
+# Update test dataset by adding energy descriptors.
 println("Computing local descriptors of test dataset")
 e_descr_test = compute_local_descriptors(conf_test, ace)
 ds_test = DataSet(conf_test .+ e_descr_test)
 
-# Get true and predicted energy values (assuming that all configurations have the same no. of atoms)
+# Get true and predicted energy values (assuming that all configurations have the same no. of atoms).
 n = size(get_system(ds_train[1]))[1]
 e_train, e_train_pred = get_all_energies(ds_train)/n, get_all_energies(ds_train, lb)/n
 e_test, e_test_pred   = get_all_energies(ds_test)/n, get_all_energies(ds_test, lb)/n
 
-# Compute and print metrics
+# Compute and print metrics.
 e_mae, e_rmse, e_rsq = calc_metrics(e_train, e_train_pred)
 println("MAE: $e_mae, RMSE: $e_rmse, RSQ: $e_rsq")
 
-# Plot energy error scatter
+# Plot energy error.
 e_err_train, e_err_test = (e_train_pred - e_train), (e_test_pred - e_test)
 dpp_inds2 = get_random_subset(dpp; batch_size = 20)
-scatter( e_train, e_err_train, label = "Training", color = :blue,
+p = scatter( e_train, e_err_train, label = "Training", color = :blue,
          markersize = 1.5, markerstrokewidth=0)
 scatter!(e_test, e_err_test, label = "Test", color = :yellow,
          markersize = 1.5, markerstrokewidth=0)

diff --git a/examples/DPP-ACE-Si/fit-dpp-ace-si.jl b/examples/DPP-ACE-Si/fit-dpp-ace-si.jl
@@ -1,20 +1,28 @@
+# # Subsample Si dataset and fit with ACE
+
+# ## Load packages, define paths, and create experiment folder.
+
+# Load packages
 using LinearAlgebra, Random, InvertedIndices
 using Statistics, StatsBase, Distributions, Determinantal
 using Unitful, UnitfulAtomic
 using AtomsBase, InteratomicPotentials, PotentialLearning
 using CSV, JLD, DataFrames
 
+# Define atomic type information
+elname, elspec = "Si", [:Si] 
+
+# Define paths.
 path = joinpath(dirname(pathof(PotentialLearning)), "../examples/DPP-ACE-Si")
+inpath = "$path/../data/Si-3Body-LAMMPS/"
+outpath = "$path/output/$elname/"
 
+# Load utility functions.
 include("$path/subsampling_utils.jl")
 
-# Load dataset
-elname = "Si"
-elspec = [:Si]
-inpath = "$path/../data/Si-3Body-LAMMPS/"
-outpath = "$path/output/$elname/"
+# ## Load atomistic datasets
 
-# Read all data
+# Load all atomistic datasets: atomistic configurations (atom positions, geometry, etc.) + DFT data (energies, forces, etc.)
 file_arr = readext(inpath, "xyz")
 nfile = length(file_arr)
 confs_arr = [load_data(inpath*file, ExtXYZ(u"eV", u"Å")) for file in file_arr]
@@ -29,7 +37,9 @@ for k = 1:nfile
     n += length(confs_arr[k])
 end
 
-# Define ACE basis
+# ## Subsampling by DPP
+
+# Create ACE basis
 nbody = 4
 deg = 5
 ace = ACE(species = elspec,             # species
@@ -40,17 +50,18 @@ ace = ACE(species = elspec,             # species
           r0 = 1.0,                     # minimum distance between atoms
           rcutoff = 10.0)
 
-# Update dataset by adding energy and force descriptors
+# Compute ACE descriptors for energies and forces.
 println("Computing local descriptors")
-e_descr = compute_local_descriptors(confs, ace)
-f_descr = compute_force_descriptors(confs, ace)
+e_descr = compute_local_descriptors(confs, ace; pbar=false)
+f_descr = compute_force_descriptors(confs, ace; pbar=false)
 JLD.save(outpath*"$(elname)_energy_descriptors.jld", "e_descr", e_descr)
 JLD.save(outpath*"$(elname)_force_descriptors.jld", "f_descr", f_descr)
 
+# Update training dataset by adding energy and force descriptors.
 ds = DataSet(confs .+ e_descr .+ f_descr)
 ndata = length(ds)
 
-# Compute cross validation error from training
+# ## Compute cross validation error from training dataset
 batch_size = [80, 40]
 sel_ind = Dict{Int64, Vector}()
 cond_num = Dict{Int64, Vector}()