cesmix-mit · emmanuellujan · Jun 12, 2024 · Jun 12, 2024 · Jun 12, 2024 · Jun 12, 2024
diff --git a/docs/Project.toml b/docs/Project.toml
@@ -1,11 +1,24 @@
 [deps]
 AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a"
+CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+Determinantal = "2673d5e8-682c-11e9-2dfd-471b09c6c819"
+Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244"
 InteratomicPotentials = "a9efe35a-c65d-452d-b8a8-82646cd5cb04"
+InvertedIndices = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
+IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
+JLD = "4138dd39-2aa7-5051-a626-17a0bb65d9c8"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
+OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 PotentialLearning = "82b0a93c-c2e3-44bc-a418-f0f89b0ae5c2"
+ProgressBars = "49802e3a-d2f1-5c88-81d8-b72133a6f568"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d"
 UnitfulAtomic = "a7773ee8-282e-5fa2-be4e-bd808c38a91a"
diff --git a/docs/make.jl b/docs/make.jl
@@ -21,7 +21,9 @@ const EXAMPLES_DIR = joinpath(@__DIR__, "..", "examples")
 const OUTPUT_DIR   = joinpath(@__DIR__, "src/generated")
 
 examples = [
+    "Fit a-HfO2 dataset with ACE" => "ACE-aHfO2/fit-ace-aHfO2.jl",
     "Subsample Na dataset with DPP and fit with ACE" => "DPP-ACE-Na/fit-dpp-ace-na.jl",
+    "Subsample Si dataset with DPP, fit with ACE, and cross validate" => "DPP-ACE-Si/fit-dpp-ace-si.jl",
     "Load Ar+Lennard-Jones dataset and postprocess" => "LJ-Ar/lennard-jones-ar.jl"
 ]
 

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -1,15 +1,15 @@
 # [WIP] PotentialLearning.jl
 
-PotentialLerning.jl: **Developing optimization workflows for fast and accurate interatomic potentials**. This package is part of a software suite developed for the [CESMIX](https://computing.mit.edu/cesmix/) project.
+**Developing optimization workflows for fast and accurate interatomic potentials**. This package is part of a software suite developed for the [CESMIX](https://computing.mit.edu/cesmix/) project.
 
 ## Goals
 
-**Optimize your atomistic data: intelligent subsampling of large datasets to reduce DFT computations**
+**Optimize your atomistic data:** intelligent subsampling of large datasets to reduce DFT computations
 - Intelligent subsampling of atomistic configurations using algorithms based on [DPP](https://github.com/dahtah/Determinantal.jl), [DBSCAN](https://docs.google.com/document/d/1SWAanEWQkpsbr2lqetMO3uvdX_QK-Z7dwrgPaM1Dl0o/edit), [CUR](https://github.com/JuliaLinearAlgebra/LowRankApprox.jl), etc.
 - Highly scalable parallel subsampling via hierarchical subsampling and distributed parallelism ([Dagger.jl](https://github.com/JuliaParallel/Dagger.jl)).
-- Optimal subsampler choosing via [Hyperopt.jl](https://github.com/baggepinnen/Hyperopt.jl).
+- Optimal subsampler selection via [Hyperopt.jl](https://github.com/baggepinnen/Hyperopt.jl).
 
-**Optimize your interatomic potential model: hyperparameters, coefficients, model compression, and model selection.**
+**Optimize your interatomic potential model:** hyperparameters, coefficients, model compression, and model selection.
 - Parallel optimization of hyperparameters, coefficients, and model selection via [Hyperopt.jl](https://github.com/baggepinnen/Hyperopt.jl); multi-objective optimization (Pareto fronts): force execution time vs fitting accuracy (e.g. MAE of energies and forces).
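-- Model compression via feature selection (e.g. [CUR](https://github.com/JuliaLinearAlgebra/LowRankApprox.jl)) and dimensionality reduction (e.g [PCA](https://juliastats.org/MultivariateStats.jl/dev/pca/), Active Subspaces) of atomistic descriptors.
+- Model compression via feature selection (e.g. [CUR](https://github.com/JuliaLinearAlgebra/LowRankApprox.jl)) and dimensionality reduction (e.g. [PCA](https://juliastats.org/MultivariateStats.jl/dev/pca/), Active Subspaces) of atomistic descriptors.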
 - Fitting of linear potentials and inference of parameter uncertainties. Training of neural versions of [Julia-ACE](https://github.com/ACEsuit/ACE1.jl) and [LAMMPS-POD](https://docs.lammps.org/pair_pod.html).

diff --git a/examples/ACE-aHfO2/Project.toml b/examples/ACE-aHfO2/Project.toml
@@ -0,0 +1,13 @@
+[deps]
+AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a"
+CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+InteratomicPotentials = "a9efe35a-c65d-452d-b8a8-82646cd5cb04"
+IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
+PotentialLearning = "82b0a93c-c2e3-44bc-a418-f0f89b0ae5c2"
+ProgressBars = "49802e3a-d2f1-5c88-81d8-b72133a6f568"
+StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
+Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d"
+UnitfulAtomic = "a7773ee8-282e-5fa2-be4e-bd808c38a91a"
diff --git a/examples/ACE-aHfO2/fit-ace-aHfO2.jl b/examples/ACE-aHfO2/fit-ace-aHfO2.jl
@@ -0,0 +1,138 @@
+using AtomsBase, InteratomicPotentials, PotentialLearning
+using Unitful, UnitfulAtomic
+using LinearAlgebra, Random
+
+path = joinpath(dirname(pathof(PotentialLearning)), "../examples/ACE-aHfO2")
+
+include("$path/../utils/utils.jl")
+
+# Setup experiment
+
+# Experiment folder: results are written to the `results` folder of this example
+path = "$path/results/"
+mkpath(path)
+
+# Define training and test configuration datasets
+
+# Load complete configuration dataset (extxyz file, using eV and Å as units)
+ds_path = "$path/../../data/a-HfO2/a-HfO2-300K-NVT-6000.extxyz"
+ds = load_data(ds_path, uparse("eV"), uparse("Å"))
+
+# Split configuration dataset into training and test (n_train = n_test = 50)
+n_train, n_test = 50, 50
+conf_train, conf_test = split(ds, n_train, n_test)
+
+# Define IAP model
+
+# Define ACE basis for the Hf and O species
+basis = ACE(species           = [:Hf, :O],
+            body_order        = 3,
+            polynomial_degree = 3,
+            rcutoff           = 5.0,
+            wL                = 1.0,
+            csp               = 1.0,
+            r0                = 1.0)
+@save_var path basis
+
+# Update training dataset by adding energy and force descriptors (both steps are timed)
+println("Computing energy descriptors of training dataset...")
+B_time = @elapsed e_descr_train = compute_local_descriptors(conf_train, basis)
+println("Computing force descriptors of training dataset...")
+dB_time = @elapsed f_descr_train = compute_force_descriptors(conf_train, basis)
+GC.gc()
+ds_train = DataSet(conf_train .+ e_descr_train .+ f_descr_train)
+
+# Learn energies and forces from the training dataset
+println("Learning energies and forces...")
+lb = LBasisPotential(basis)
+ws, int = [1.0, 1.0], false
+learn!(lb, ds_train, ws, int)
+
+@save_var path lb.β
+@save_var path lb.β0
+lb.β, lb.β0
+
+# Post-process output: calculate metrics, create plots, and save results
+
+# Update test dataset by adding energy and force descriptors
+println("Computing energy descriptors of test dataset...")
+e_descr_test = compute_local_descriptors(conf_test, basis)
+println("Computing force descriptors of test dataset...")
+f_descr_test = compute_force_descriptors(conf_test, basis)
+GC.gc()
+ds_test = DataSet(conf_test .+ e_descr_test .+ f_descr_test)
+
+# Get true and predicted values (energies are divided by the system length of each configuration)
+n_atoms_train = length.(get_system.(ds_train))
+n_atoms_test = length.(get_system.(ds_test))
+
+e_train, e_train_pred = get_all_energies(ds_train) ./ n_atoms_train,
+                        get_all_energies(ds_train, lb) ./ n_atoms_train
+f_train, f_train_pred = get_all_forces(ds_train),
+                        get_all_forces(ds_train, lb)
+@save_var path e_train
+@save_var path e_train_pred
+@save_var path f_train
+@save_var path f_train_pred
+
+e_test, e_test_pred = get_all_energies(ds_test) ./ n_atoms_test,
+                      get_all_energies(ds_test, lb) ./ n_atoms_test
+f_test, f_test_pred = get_all_forces(ds_test),
+                      get_all_forces(ds_test, lb)
+@save_var path e_test
+@save_var path e_test_pred
+@save_var path f_test
+@save_var path f_test_pred
+
+# Compute training metrics (energy and force metrics are merged into a single collection)
+e_train_metrics = get_metrics(e_train, e_train_pred,
+                              metrics = [mae, rmse, rsq],
+                              label = "e_train")
+f_train_metrics = get_metrics(f_train, f_train_pred,
+                              metrics = [mae, rmse, rsq, mean_cos],
+                              label = "f_train")
+train_metrics = merge(e_train_metrics, f_train_metrics)
+@save_dict path train_metrics
+train_metrics
+
+# Compute test metrics (energy and force metrics are merged into a single collection)
+e_test_metrics = get_metrics(e_test, e_test_pred,
+                             metrics = [mae, rmse, rsq],
+                             label = "e_test")
+f_test_metrics = get_metrics(f_test, f_test_pred,
+                             metrics = [mae, rmse, rsq, mean_cos],
+                             label = "f_test")
+test_metrics = merge(e_test_metrics, f_test_metrics)
+@save_dict path test_metrics
+test_metrics
+
+# Plot and save energy results (training and test together)
+e_plot = plot_energy(e_train, e_train_pred,
+                     e_test, e_test_pred)
+@save_fig path e_plot
+e_plot
+
+# Plot and save force results (training and test together)
+f_plot = plot_forces(f_train, f_train_pred,
+                     f_test, f_test_pred)
+@save_fig path f_plot
+f_plot
+
+# Plot and save training energy, force, and force cosine results
+e_train_plot = plot_energy(e_train, e_train_pred)
+f_train_plot = plot_forces(f_train, f_train_pred)
+f_train_cos  = plot_cos(f_train, f_train_pred)
+@save_fig path e_train_plot
+@save_fig path f_train_plot
+@save_fig path f_train_cos
+f_train_cos
+
+# Plot and save test energy, force, and force cosine results
+e_test_plot = plot_energy(e_test, e_test_pred)
+f_test_plot = plot_forces(f_test, f_test_pred)
+f_test_cos  = plot_cos(f_test, f_test_pred)
+@save_fig path e_test_plot
+@save_fig path f_test_plot
+@save_fig path f_test_cos
+f_test_cos
+
diff --git a/examples/DPP-ACE-Si/Project.toml b/examples/DPP-ACE-Si/Project.toml
@@ -1,7 +1,6 @@
 [deps]
 AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a"
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
-CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Determinantal = "2673d5e8-682c-11e9-2dfd-471b09c6c819"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"

diff --git a/examples/DPP-ACE-Si/fit-dpp-ace-si.jl b/examples/DPP-ACE-Si/fit-dpp-ace-si.jl
@@ -3,14 +3,14 @@ using Statistics, StatsBase, Distributions, Determinantal
 using Unitful, UnitfulAtomic
 using AtomsBase, InteratomicPotentials, PotentialLearning
 using CSV, JLD, DataFrames
-using CairoMakie
 
-include("subsampling_utils.jl")
+path = joinpath(dirname(pathof(PotentialLearning)), "../examples/DPP-ACE-Si")
+
+include("$path/subsampling_utils.jl")
 
-# Load dataset -----------------------------------------------------------------
+# Load dataset
 elname = "Si"
 elspec = [:Si]
-path = joinpath(dirname(pathof(PotentialLearning)), "../examples/DPP-ACE-Si")
 inpath = "$path/../data/Si-3Body-LAMMPS/"
 outpath = "$path/output/$elname/"
 
@@ -29,11 +29,7 @@ for k = 1:nfile
     n += length(confs_arr[k])
 end
 
-# Read single file
-# datafile = "Hf_mp100_EOS_1D_form_sorted.xyz"
-# confs = load_data(inpath*datafile, ExtXYZ(u"eV", u"Å"))
-
-# Define ACE basis -------------------------------------------------------------
+# Define ACE basis
 nbody = 4
 deg = 5
 ace = ACE(species = elspec,             # species
@@ -44,7 +40,7 @@ ace = ACE(species = elspec,             # species
           r0 = 1.0,                     # minimum distance between atoms
           rcutoff = 10.0)
 
-# Update dataset by adding energy (local) descriptors --------------------------
+# Update dataset by adding energy and force descriptors
 println("Computing local descriptors")
 e_descr = compute_local_descriptors(confs, ace)
 f_descr = compute_force_descriptors(confs, ace)
@@ -54,14 +50,15 @@ JLD.save(outpath*"$(elname)_force_descriptors.jld", "f_descr", f_descr)
 ds = DataSet(confs .+ e_descr .+ f_descr)
 ndata = length(ds)
 
-# Compute cross validation error from training ---------------------------------
+# Compute cross validation error from training
 batch_size = [80, 40]
 sel_ind = Dict{Int64, Vector}()
 cond_num = Dict{Int64, Vector}()
 
 for bs in batch_size
     println("=============== Starting batch size $bs ===============")
     sel_ind[bs], cond_num[bs] = cross_validation_training(ds; ndiv=5, dpp_batch=bs)
+    println("condnum: $(cond_num[bs])")
 end
 
 JLD.save(outpath*"$(elname)_ACE-$(nbody)-$(deg)_DPP_indices_and_condnum.jld",

diff --git a/examples/data/a-HfO2/README.md b/examples/data/a-HfO2/README.md
@@ -0,0 +1 @@
+Source: https://github.com/argonne-lcf/active-learning-md