From d95189aa4fe1e38bc89ec2fbfa81314d261fc647 Mon Sep 17 00:00:00 2001 From: David Atienza <datienza@fi.upm.es> Date: Tue, 10 Sep 2024 21:07:20 +0200 Subject: [PATCH 01/75] Release v0.5.1. --- CHANGELOG.md | 5 +++++ CMakeLists.txt | 24 +++++++++++++++++++----- README.md | 2 +- docs/source/changelog.rst | 5 +++++ docs/source/conf.py | 4 ++-- docs/source/pybnesian.rst | 2 +- overlay_ports/arrow/android.patch | 4 ++-- overlay_ports/arrow/portfile.cmake | 3 +-- overlay_ports/arrow/usage | 8 ++++---- overlay_ports/arrow/usage-acero | 6 +++--- overlay_ports/arrow/usage-dataset | 6 +++--- overlay_ports/arrow/usage-flight | 6 +++--- overlay_ports/arrow/usage-flightsql | 6 +++--- overlay_ports/arrow/usage-parquet | 6 +++--- overlay_ports/arrow/vcpkg.json | 2 +- pyproject.toml | 2 +- vcpkg | 2 +- 17 files changed, 58 insertions(+), 35 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 90c3bd98..2a6b8836 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## v0.5.1 + +- Fixes vcpkg bad hashes ([vcpkg/#38974](https://github.com/microsoft/vcpkg/issues/38974)). +- Updates arrow to 17.0.0. + ## v0.5.0 - Changed the build process to statically link Apache Arrow. With this change and using the diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ecdf54c..52f06641 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,8 +16,11 @@ IF(EXISTS ".git") SET(GIT_COMMAND_EXECUTED "${GIT_EXECUTABLE} submodule update --init --recursive") execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init --recursive WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} RESULT_VARIABLE GIT_SUBMOD_RESULT) + + IF(NOT GIT_SUBMOD_RESULT EQUAL "0") + message(FATAL_ERROR "${GIT_COMMAND_EXECUTED} failed with ${GIT_SUBMOD_RESULT}.") + ENDIF() ELSE() - SET(GIT_COMMIT_HASH "a1212c93c") SET(GIT_COMMAND_EXECUTED "${GIT_EXECUTABLE} clone https://github.com/Microsoft/vcpkg.git") execute_process(COMMAND ${GIT_EXECUTABLE} clone https://github.com/Microsoft/vcpkg.git WORKING_DIRECTORY "."
RESULT_VARIABLE GIT_SUBMOD_RESULT) @@ -25,17 +28,28 @@ ELSE() IF(NOT GIT_SUBMOD_RESULT EQUAL "0") message(FATAL_ERROR "${GIT_COMMAND_EXECUTED} failed with ${GIT_SUBMOD_RESULT}.") ENDIF() +ENDIF() - SET(GIT_COMMAND_EXECUTED "${GIT_EXECUTABLE} checkout ${GIT_COMMIT_HASH}") - execute_process(COMMAND ${GIT_EXECUTABLE} checkout ${GIT_COMMIT_HASH} - WORKING_DIRECTORY "vcpkg" RESULT_VARIABLE GIT_SUBMOD_RESULT) +SET(GIT_COMMIT_HASH "2024.08.23") +SET(GIT_COMMAND_EXECUTED "${GIT_EXECUTABLE} checkout ${GIT_COMMIT_HASH}") +execute_process(COMMAND ${GIT_EXECUTABLE} checkout ${GIT_COMMIT_HASH} + WORKING_DIRECTORY "vcpkg" RESULT_VARIABLE GIT_SUBMOD_RESULT) + +IF(NOT GIT_SUBMOD_RESULT EQUAL "0") + message(FATAL_ERROR "${GIT_COMMAND_EXECUTED} failed with ${GIT_SUBMOD_RESULT}.") ENDIF() +SET(GIT_COMMAND_EXECUTED "${GIT_EXECUTABLE} rev-parse HEAD") +execute_process(COMMAND ${GIT_EXECUTABLE} rev-parse HEAD + WORKING_DIRECTORY "vcpkg" RESULT_VARIABLE GIT_SUBMOD_RESULT OUTPUT_VARIABLE GIT_STDOUT) + IF(NOT GIT_SUBMOD_RESULT EQUAL "0") - message(FATAL_ERROR "${GIT_COMMAND_EXECUTED} failed with ${GIT_SUBMOD_RESULT}, please checkout submodules.") + message(FATAL_ERROR "${GIT_COMMAND_EXECUTED} failed with ${GIT_SUBMOD_RESULT}.") ENDIF() +message("Git commit in vcpkg: ${GIT_STDOUT}") + set(CMAKE_TOOLCHAIN_FILE "vcpkg/scripts/buildsystems/vcpkg.cmake") project(pybnesian VERSION ${SKBUILD_PROJECT_VERSION} LANGUAGES CXX) diff --git a/README.md b/README.md index 56c6b991..eba05f6e 100644 --- a/README.md +++ b/README.md @@ -305,7 +305,7 @@ Clone the repository: ``` git clone https://github.com/davenza/PyBNesian.git cd PyBNesian -git checkout v0.5.0 # You can checkout a specific version if you want +git checkout v0.5.1 # You can checkout a specific version if you want pip install . ``` diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 00934033..38042515 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -2,6 +2,11 @@ Changelog ********* +v0.5.1 +====== + +- Fixes vcpkg bad hashes for boost-core (`vcpkg/#38974 <https://github.com/microsoft/vcpkg/issues/38974>`_). +- Updates arrow to 17.0.0. v0.5.0 ====== diff --git a/docs/source/conf.py b/docs/source/conf.py index 065aba39..39539f5e 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -22,8 +22,8 @@ author = 'David Atienza' # The full version, including alpha/beta/rc tags -version = '0.5.0' -release = '0.5.0' +version = '0.5.1' +release = '0.5.1' # -- General configuration --------------------------------------------------- diff --git a/docs/source/pybnesian.rst b/docs/source/pybnesian.rst index 2640325b..a5644d30 100644 --- a/docs/source/pybnesian.rst +++ b/docs/source/pybnesian.rst @@ -67,7 +67,7 @@ Clone the repository: git clone https://github.com/davenza/PyBNesian.git cd PyBNesian - git checkout v0.5.0 # You can checkout a specific version if you want + git checkout v0.5.1 # You can checkout a specific version if you want pip install .
Testing diff --git a/overlay_ports/arrow/android.patch b/overlay_ports/arrow/android.patch index 12f704db..b228d888 100644 --- a/overlay_ports/arrow/android.patch +++ b/overlay_ports/arrow/android.patch @@ -1,9 +1,9 @@ diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt -index 026bb5c..5c1b5e3 100644 +index 6dc8358..2b91efa 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -166,7 +166,7 @@ if(WIN32) - list(APPEND ARROW_SYSTEM_LINK_LIBS "ws2_32.dll") + list(APPEND ARROW_SYSTEM_LINK_LIBS "ws2_32") endif() -if(NOT WIN32 AND NOT APPLE) diff --git a/overlay_ports/arrow/portfile.cmake b/overlay_ports/arrow/portfile.cmake index 0da43079..786ca065 100644 --- a/overlay_ports/arrow/portfile.cmake +++ b/overlay_ports/arrow/portfile.cmake @@ -2,7 +2,7 @@ vcpkg_download_distfile( ARCHIVE_PATH URLS "https://archive.apache.org/dist/arrow/arrow-${VERSION}/apache-arrow-${VERSION}.tar.gz" FILENAME apache-arrow-${VERSION}.tar.gz - SHA512 28975f59e1fdde2dba4afaf4a5ba934b63db3a7f27656e2aa0af0f0d2a046c9dbfa9a6082de94629c36d03809b296566a37ea65ec5a2fc17fedac7d21e272d31 + SHA512 4e2a617b8deeb9f94ee085653a721904a75696f0827bcba82b535cc7f4f723066a09914c7fa83c593e51a8a4031e8bf99e563cac1ebb1d89604cb406975d4864 ) vcpkg_extract_source_archive( SOURCE_PATH @@ -12,7 +12,6 @@ vcpkg_extract_source_archive( msvc-static-name.patch utf8proc.patch thrift.patch - remove-dll-suffix.patch #Upstream PR: https://github.com/apache/arrow/pull/41341 ) vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS diff --git a/overlay_ports/arrow/usage b/overlay_ports/arrow/usage index b07f1c9c..a0043630 100644 --- a/overlay_ports/arrow/usage +++ b/overlay_ports/arrow/usage @@ -1,4 +1,4 @@ -The package arrow provides CMake targets: - - find_package(Arrow CONFIG REQUIRED) - target_link_libraries(main PRIVATE "$<IF:$<BOOL:${ARROW_BUILD_STATIC}>,Arrow::arrow_static,Arrow::arrow_shared>") +The package arrow provides CMake targets: + + find_package(Arrow CONFIG REQUIRED) + target_link_libraries(main PRIVATE "$<IF:$<BOOL:${ARROW_BUILD_STATIC}>,Arrow::arrow_static,Arrow::arrow_shared>") diff --git a/overlay_ports/arrow/usage-acero b/overlay_ports/arrow/usage-acero index 05c4776b..bde5ef80 100644 --- a/overlay_ports/arrow/usage-acero +++ b/overlay_ports/arrow/usage-acero @@ -1,3 +1,3 @@ - - find_package(ArrowAcero CONFIG REQUIRED) - target_link_libraries(main PRIVATE "$<IF:$<BOOL:${ARROW_BUILD_STATIC}>,ArrowAcero::arrow_acero_static,ArrowAcero::arrow_acero_shared>") + + find_package(ArrowAcero CONFIG REQUIRED) + target_link_libraries(main PRIVATE "$<IF:$<BOOL:${ARROW_BUILD_STATIC}>,ArrowAcero::arrow_acero_static,ArrowAcero::arrow_acero_shared>") diff --git a/overlay_ports/arrow/usage-dataset b/overlay_ports/arrow/usage-dataset index 8bedfb83..dcb9bda4 100644 --- a/overlay_ports/arrow/usage-dataset +++ b/overlay_ports/arrow/usage-dataset @@ -1,3 +1,3 @@ - - find_package(ArrowDataset CONFIG REQUIRED) - target_link_libraries(main PRIVATE "$<IF:$<BOOL:${ARROW_BUILD_STATIC}>,ArrowDataset::arrow_dataset_static,ArrowDataset::arrow_dataset_shared>") + + find_package(ArrowDataset CONFIG REQUIRED) + target_link_libraries(main PRIVATE "$<IF:$<BOOL:${ARROW_BUILD_STATIC}>,ArrowDataset::arrow_dataset_static,ArrowDataset::arrow_dataset_shared>") diff --git a/overlay_ports/arrow/usage-flight b/overlay_ports/arrow/usage-flight index c9656253..69abcf6c 100644 --- a/overlay_ports/arrow/usage-flight +++ b/overlay_ports/arrow/usage-flight @@ -1,3 +1,3 @@ - - find_package(ArrowFlight CONFIG REQUIRED) - target_link_libraries(main PRIVATE "$<IF:$<BOOL:${ARROW_BUILD_STATIC}>,ArrowFlight::arrow_flight_static,ArrowFlight::arrow_flight_shared>") + + find_package(ArrowFlight CONFIG REQUIRED) + target_link_libraries(main PRIVATE
"$,ArrowFlight::arrow_flight_static,ArrowFlight::arrow_flight_shared>") diff --git a/overlay_ports/arrow/usage-flightsql b/overlay_ports/arrow/usage-flightsql index 3159c059..2f7e9cf1 100644 --- a/overlay_ports/arrow/usage-flightsql +++ b/overlay_ports/arrow/usage-flightsql @@ -1,3 +1,3 @@ - - find_package(ArrowFlightSql CONFIG REQUIRED) - target_link_libraries(main PRIVATE "$,ArrowFlightSql::arrow_flight_sql_static,ArrowFlightSql::arrow_flight_sql_shared>") + + find_package(ArrowFlightSql CONFIG REQUIRED) + target_link_libraries(main PRIVATE "$,ArrowFlightSql::arrow_flight_sql_static,ArrowFlightSql::arrow_flight_sql_shared>") diff --git a/overlay_ports/arrow/usage-parquet b/overlay_ports/arrow/usage-parquet index ce04039a..ddec5c5a 100644 --- a/overlay_ports/arrow/usage-parquet +++ b/overlay_ports/arrow/usage-parquet @@ -1,3 +1,3 @@ - - find_package(Parquet CONFIG REQUIRED) - target_link_libraries(main PRIVATE "$,Parquet::parquet_static,Parquet::parquet_shared>") + + find_package(Parquet CONFIG REQUIRED) + target_link_libraries(main PRIVATE "$,Parquet::parquet_static,Parquet::parquet_shared>") diff --git a/overlay_ports/arrow/vcpkg.json b/overlay_ports/arrow/vcpkg.json index 7b6af5db..a9f5c4e3 100644 --- a/overlay_ports/arrow/vcpkg.json +++ b/overlay_ports/arrow/vcpkg.json @@ -1,6 +1,6 @@ { "name": "arrow", - "version": "16.1.0", + "version": "17.0.0", "description": "Cross-language development platform for in-memory analytics", "homepage": "https://arrow.apache.org", "license": "Apache-2.0", diff --git a/pyproject.toml b/pyproject.toml index 4f5d51c8..1be96df0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ sdist.exclude = ["vcpkg/*", "docs/"] name = "pybnesian" authors = [{name = "David Atienza", email = "datienza@fi.upm.es"}] description="PyBNesian is a Python package that implements Bayesian networks." -version = "0.5.0" +version = "0.5.1" readme = {file = "README.md", content-type = "text/markdown"} license = { file = "LICENSE" } requires-python = ">=3.8" diff --git a/vcpkg b/vcpkg index 7eb700c9..35089851 160000 --- a/vcpkg +++ b/vcpkg @@ -1 +1 @@ -Subproject commit 7eb700c9688daed6d8bdcdc571ebe3eedea6a774 +Subproject commit 3508985146f1b1d248c67ead13f8f54be5b4f5da From 0e38801622931a2c0a4bbf0a81024684cdb43dab Mon Sep 17 00:00:00 2001 From: David Atienza Date: Tue, 10 Sep 2024 21:27:39 +0200 Subject: [PATCH 02/75] Updates cibuildwheel to support manylinux. 
--- .github/workflows/release.yml | 2 +- .github/workflows/test_release.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index cca1d2cd..206feae4 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -36,7 +36,7 @@ jobs: 7z x lightOCLSDK.zip -oOCLSDK - name: Install cibuildwheel - run: python -m pip install cibuildwheel==2.18.0 + run: python -m pip install cibuildwheel - name: Build wheels if: runner.os != 'macOS' diff --git a/.github/workflows/test_release.yml b/.github/workflows/test_release.yml index d657e34a..67fdb4dc 100644 --- a/.github/workflows/test_release.yml +++ b/.github/workflows/test_release.yml @@ -32,7 +32,7 @@ jobs: 7z x lightOCLSDK.zip -oOCLSDK - name: Install cibuildwheel - run: python -m pip install cibuildwheel==2.18.0 + run: python -m pip install cibuildwheel - name: Build wheels if: runner.os != 'macOS' From cdb619cb6da99eb47c84f34dd2e8858be253416f Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Tue, 10 Sep 2024 22:28:59 +0200 Subject: [PATCH 03/75] Python code refactored with "black" --- conv_template.py | 92 ++-- docs/source/conf.py | 43 +- expand_sources.py | 23 +- lib/eigen-3.3.7/debug/gdb/printers.py | 371 +++++++------- tests/conftest.py | 1 + tests/dataset/crossvalidation_test.py | 265 +++++++--- tests/dataset/holdout_test.py | 111 ++-- tests/factors/continuous/CKDE_test.py | 477 ++++++++++++------ tests/factors/continuous/KDE_test.py | 260 ++++++---- .../continuous/LinearGaussianCPD_test.py | 328 ++++++++---- tests/factors/continuous/ProductKDE_test.py | 338 +++++++++---- tests/factors/discrete/DiscreteFactor_test.py | 30 +- tests/factors/factor_type_test.py | 11 +- tests/helpers/util_test.py | 245 +++++---- tests/learning/algorithms/constraint_test.py | 36 +- .../learning/algorithms/hillclimbing_test.py | 133 +++-- tests/learning/operators/operatorpool_test.py | 10 +- tests/learning/operators/operators_test.py | 65 +-- tests/learning/operators/operatorset_test.py | 22 +- .../operators/operatorstabuset_test.py | 7 +- tests/learning/parameters/mle_test.py | 8 +- tests/learning/scores/bic_test.py | 120 +++-- tests/learning/scores/cvlikelihood_test.py | 351 ++++++++----- .../learning/scores/holdoutlikelihood_test.py | 384 ++++++++++---- tests/models/BayesianNetwork_test.py | 388 +++++++------- tests/models/BayesianNetwork_type_test.py | 32 +- tests/models/DynamicBayesianNetwork_test.py | 63 ++- tests/models/HeterogeneousBN_test.py | 82 +-- tests/models/SemiparametricBN_test.py | 195 ++++--- tests/serialization/serialize_factor_test.py | 91 +++- .../serialize_factor_type_test.py | 18 +- tests/serialization/serialize_models_test.py | 355 +++++++++---- .../serialize_models_type_test.py | 22 +- 33 files changed, 3243 insertions(+), 1734 deletions(-) diff --git a/conv_template.py b/conv_template.py index 9f13d9b3..baf0371e 100644 --- a/conv_template.py +++ b/conv_template.py @@ -62,7 +62,7 @@ 3, 3, jim """ -__all__ = ['process_str', 'process_file'] +__all__ = ["process_str", "process_file"] import os import sys @@ -72,8 +72,7 @@ global_names = {} # header placed at the front of head processed file -header =\ -""" +header = """ /* ***************************************************************************** ** This file was autogenerated from a template DO NOT EDIT!!!! 
** @@ -81,16 +80,18 @@ ***************************************************************************** */ """ + + # Parse string for repeat loops def parse_structure(astr, level): """ The returned line number is from the beginning of the string, starting at zero. Returns an empty list if no loops found. """ - if level == 0 : + if level == 0: loopbeg = "/**begin repeat" loopend = "/**end repeat**/" - else : + else: loopbeg = "/**begin repeat%d" % level loopend = "/**end repeat%d**/" % level @@ -105,9 +106,9 @@ def parse_structure(astr, level): start2 = astr.find("\n", start2) fini1 = astr.find(loopend, start2) fini2 = astr.find("\n", fini1) - line += astr.count("\n", ind, start2+1) - spanlist.append((start, start2+1, fini1, fini2+1, line)) - line += astr.count("\n", start2+1, fini2) + line += astr.count("\n", ind, start2 + 1) + spanlist.append((start, start2 + 1, fini1, fini2 + 1, line)) + line += astr.count("\n", start2 + 1, fini2) ind = fini2 spanlist.sort() return spanlist @@ -116,10 +117,13 @@ def parse_structure(astr, level): def paren_repl(obj): torep = obj.group(1) numrep = obj.group(2) - return ','.join([torep]*int(numrep)) + return ",".join([torep] * int(numrep)) + parenrep = re.compile(r"\(([^)]*)\)\*(\d+)") plainrep = re.compile(r"([^*]+)\*(\d+)") + + def parse_values(astr): # replaces all occurrences of '(a,b,c)*4' in astr # with 'a,b,c,a,b,c,a,b,c,a,b,c'. Empty braces generate @@ -127,16 +131,17 @@ def parse_values(astr): # split at ',' and a list of values returned. astr = parenrep.sub(paren_repl, astr) # replaces occurrences of xxx*3 with xxx, xxx, xxx - astr = ','.join([plainrep.sub(paren_repl, x.strip()) - for x in astr.split(',')]) - return astr.split(',') + astr = ",".join([plainrep.sub(paren_repl, x.strip()) for x in astr.split(",")]) + return astr.split(",") stripast = re.compile(r"\n\s*\*?") named_re = re.compile(r"#\s*(\w*)\s*=([^#]*)#") exclude_vars_re = re.compile(r"(\w*)=(\w*)") exclude_re = re.compile(":exclude:") -def parse_loop_header(loophead) : + + +def parse_loop_header(loophead): """Find all named replacements in the header Returns a list of dictionaries, one for each loop iteration, where each key is a name to be substituted and the corresponding @@ -157,86 +162,91 @@ def parse_loop_header(loophead) : name = rep[0] vals = parse_values(rep[1]) size = len(vals) - if nsub is None : + if nsub is None: nsub = size - elif nsub != size : + elif nsub != size: msg = "Mismatch in number of values, %d != %d\n%s = %s" raise ValueError(msg % (nsub, size, name, vals)) names.append((name, vals)) - # Find any exclude variables excludes = [] for obj in exclude_re.finditer(loophead): span = obj.span() # find next newline - endline = loophead.find('\n', span[1]) - substr = loophead[span[1]:endline] + endline = loophead.find("\n", span[1]) + substr = loophead[span[1] : endline] ex_names = exclude_vars_re.findall(substr) excludes.append(dict(ex_names)) # generate list of dictionaries, one for each template iteration dlist = [] - if nsub is None : + if nsub is None: raise ValueError("No substitution variables found") for i in range(nsub): tmp = {name: vals[i] for name, vals in names} dlist.append(tmp) return dlist + replace_re = re.compile(r"@(\w+)@") -def parse_string(astr, env, level, line) : + + +def parse_string(astr, env, level, line): lineno = "#line %d\n" % line # local function for string replacement, uses env def replace(match): name = match.group(1) - try : + try: val = env[name] except KeyError: - msg = 'line %d: no definition of key "%s"'%(line, name) + msg = 'line %d: 
no definition of key "%s"' % (line, name) raise ValueError(msg) from None return val code = [lineno] struct = parse_structure(astr, level) - if struct : + if struct: # recurse over inner loops oldend = 0 newlevel = level + 1 for sub in struct: - pref = astr[oldend:sub[0]] - head = astr[sub[0]:sub[1]] - text = astr[sub[1]:sub[2]] + pref = astr[oldend : sub[0]] + head = astr[sub[0] : sub[1]] + text = astr[sub[1] : sub[2]] oldend = sub[3] newline = line + sub[4] code.append(replace_re.sub(replace, pref)) - try : + try: envlist = parse_loop_header(head) except ValueError as e: msg = "line %d: %s" % (newline, e) raise ValueError(msg) - for newenv in envlist : + for newenv in envlist: newenv.update(env) newcode = parse_string(text, newenv, newlevel, newline) code.extend(newcode) suff = astr[oldend:] code.append(replace_re.sub(replace, suff)) - else : + else: # replace keys code.append(replace_re.sub(replace, astr)) - code.append('\n') - return ''.join(code) + code.append("\n") + return "".join(code) + def process_str(astr): code = [header] code.extend(parse_string(astr, global_names, 0, 1)) - return ''.join(code) + return "".join(code) + +include_src_re = re.compile( + r"(\n|\A)#include\s*['\"]" r"(?P[\w\d./\\]+[.]src)['\"]", re.I +) -include_src_re = re.compile(r"(\n|\A)#include\s*['\"]" - r"(?P[\w\d./\\]+[.]src)['\"]", re.I) def resolve_includes(source): d = os.path.dirname(source) @@ -245,11 +255,11 @@ def resolve_includes(source): for line in fid: m = include_src_re.match(line) if m: - fn = m.group('name') + fn = m.group("name") if not os.path.isabs(fn): fn = os.path.join(d, fn) if os.path.isfile(fn): - print('Including file', fn) + print("Including file", fn) lines.extend(resolve_includes(fn)) else: lines.append(line) @@ -257,11 +267,12 @@ def resolve_includes(source): lines.append(line) return lines + def process_file(source): lines = resolve_includes(source) sourcefile = os.path.normcase(source).replace("\\", "\\\\") try: - code = process_str(''.join(lines)) + code = process_str("".join(lines)) except ValueError as e: raise ValueError('In "%s" loop at %s' % (sourcefile, e)) from None return '#line 1 "%s"\n%s' % (sourcefile, code) @@ -291,10 +302,10 @@ def main(): fid = sys.stdin outfile = sys.stdout else: - fid = open(file, 'r') + fid = open(file, "r") (base, ext) = os.path.splitext(file) newname = base - outfile = open(newname, 'w') + outfile = open(newname, "w") allstr = fid.read() try: @@ -304,5 +315,6 @@ def main(): outfile.write(writestr) + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/docs/source/conf.py b/docs/source/conf.py index 39539f5e..7cc18217 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -17,13 +17,13 @@ # -- Project information ----------------------------------------------------- -project = 'PyBNesian' -copyright = '2024, David Atienza' -author = 'David Atienza' +project = "PyBNesian" +copyright = "2024, David Atienza" +author = "David Atienza" # The full version, including alpha/beta/rc tags -version = '0.5.1' -release = '0.5.1' +version = "0.5.1" +release = "0.5.1" # -- General configuration --------------------------------------------------- @@ -32,20 +32,21 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.intersphinx', - 'sphinx.ext.autosummary', - 'sphinx.ext.napoleon', - 'sphinx.ext.doctest', - 'sphinx.ext.mathjax', - 'sphinx_rtd_theme'] + "sphinx.ext.autodoc", + "sphinx.ext.intersphinx", + "sphinx.ext.autosummary", + "sphinx.ext.napoleon", + "sphinx.ext.doctest", + "sphinx.ext.mathjax", + "sphinx_rtd_theme", +] autosummary_generate = True # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] -source_suffix = '.rst' +source_suffix = ".rst" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -53,13 +54,13 @@ exclude_patterns = [] # Removes the module prefix of the class definition -#add_module_names = False +# add_module_names = False intersphinx_mapping = { - 'pyarrow': ('https://arrow.apache.org/docs/', None), - 'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None), - 'numpy': ('https://numpy.org/doc/stable/', None), - 'pickle': ('https://docs.python.org/3/', None) + "pyarrow": ("https://arrow.apache.org/docs/", None), + "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), + "numpy": ("https://numpy.org/doc/stable/", None), + "pickle": ("https://docs.python.org/3/", None), } # -- Options for HTML output ------------------------------------------------- @@ -67,9 +68,9 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] \ No newline at end of file +html_static_path = ["_static"] diff --git a/expand_sources.py b/expand_sources.py index 1850d01f..52c24db2 100644 --- a/expand_sources.py +++ b/expand_sources.py @@ -4,32 +4,33 @@ def expand_sources(): - sources = ['pybnesian/kde/opencl_kernels/KDE.cl.src'] - + sources = ["pybnesian/kde/opencl_kernels/KDE.cl.src"] + for source in sources: (base, _) = os.path.splitext(source) outstr = conv_template.process_file(source) - with open(base, 'w') as fid: + with open(base, "w") as fid: fid.write(outstr) def copy_opencl_code(): - sources = ['pybnesian/kde/opencl_kernels/KDE.cl'] + sources = ["pybnesian/kde/opencl_kernels/KDE.cl"] # Split the CPP code because the MSVC only allow strings of a max size. 
# Error C2026: https://docs.microsoft.com/en-us/cpp/error-messages/compiler-errors-1/compiler-error-c2026?view=msvc-160 - MAX_LENGTH=16378 + MAX_LENGTH = 16378 code_str = "" for source in sources: - code_str += '\n' + code_str += "\n" with open(source) as f: source_code = f.read() code_str += source_code - fragments = [code_str[i:(i + MAX_LENGTH)] for i in range(0, len(code_str), MAX_LENGTH)] + fragments = [ + code_str[i : (i + MAX_LENGTH)] for i in range(0, len(code_str), MAX_LENGTH) + ] - cpp_code = \ - """#ifndef PYBNESIAN_OPENCL_OPENCL_CODE_HPP + cpp_code = """#ifndef PYBNESIAN_OPENCL_OPENCL_CODE_HPP #define PYBNESIAN_OPENCL_OPENCL_CODE_HPP namespace opencl { @@ -42,10 +43,10 @@ def copy_opencl_code(): } #endif //PYBNESIAN_OPENCL_OPENCL_CODE_HPP""" - with open('pybnesian/opencl/opencl_code.hpp', 'w') as f: + with open("pybnesian/opencl/opencl_code.hpp", "w") as f: f.write(cpp_code) if __name__ == "__main__": expand_sources() - copy_opencl_code() \ No newline at end of file + copy_opencl_code() diff --git a/lib/eigen-3.3.7/debug/gdb/printers.py b/lib/eigen-3.3.7/debug/gdb/printers.py index 0d67a5f9..b6ab74a7 100644 --- a/lib/eigen-3.3.7/debug/gdb/printers.py +++ b/lib/eigen-3.3.7/debug/gdb/printers.py @@ -9,14 +9,14 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. # Pretty printers for Eigen::Matrix -# This is still pretty basic as the python extension to gdb is still pretty basic. +# This is still pretty basic as the python extension to gdb is still pretty basic. # It cannot handle complex eigen types and it doesn't support any of the other eigen types -# Such as quaternion or some other type. +# Such as quaternion or some other type. # This code supports fixed size as well as dynamic size matrices # To use it: # -# * Create a directory and put the file as well as an empty __init__.py in +# * Create a directory and put the file as well as an empty __init__.py in # that directory. 
# * Create a ~/.gdbinit file, that contains the following: # python @@ -32,183 +32,208 @@ class EigenMatrixPrinter: - "Print Eigen Matrix or Array of some kind" - - def __init__(self, variety, val): - "Extract all the necessary information" - - # Save the variety (presumably "Matrix" or "Array") for later usage - self.variety = variety - - # The gdb extension does not support value template arguments - need to extract them by hand - type = val.type - if type.code == gdb.TYPE_CODE_REF: - type = type.target() - self.type = type.unqualified().strip_typedefs() - tag = self.type.tag - regex = re.compile('\<.*\>') - m = regex.findall(tag)[0][1:-1] - template_params = m.split(',') - template_params = [x.replace(" ", "") for x in template_params] - - if template_params[1] == '-0x00000000000000001' or template_params[1] == '-0x000000001' or template_params[1] == '-1': - self.rows = val['m_storage']['m_rows'] - else: - self.rows = int(template_params[1]) - - if template_params[2] == '-0x00000000000000001' or template_params[2] == '-0x000000001' or template_params[2] == '-1': - self.cols = val['m_storage']['m_cols'] - else: - self.cols = int(template_params[2]) - - self.options = 0 # default value - if len(template_params) > 3: - self.options = template_params[3]; - - self.rowMajor = (int(self.options) & 0x1) - - self.innerType = self.type.template_argument(0) - - self.val = val - - # Fixed size matrices have a struct as their storage, so we need to walk through this - self.data = self.val['m_storage']['m_data'] - if self.data.type.code == gdb.TYPE_CODE_STRUCT: - self.data = self.data['array'] - self.data = self.data.cast(self.innerType.pointer()) - - class _iterator: - def __init__ (self, rows, cols, dataPtr, rowMajor): - self.rows = rows - self.cols = cols - self.dataPtr = dataPtr - self.currentRow = 0 - self.currentCol = 0 - self.rowMajor = rowMajor - - def __iter__ (self): - return self - - def next(self): - return self.__next__() # Python 2.x compatibility - - def __next__(self): - - row = self.currentRow - col = self.currentCol - if self.rowMajor == 0: - if self.currentCol >= self.cols: - raise StopIteration - - self.currentRow = self.currentRow + 1 - if self.currentRow >= self.rows: - self.currentRow = 0 - self.currentCol = self.currentCol + 1 - else: - if self.currentRow >= self.rows: - raise StopIteration - - self.currentCol = self.currentCol + 1 - if self.currentCol >= self.cols: - self.currentCol = 0 - self.currentRow = self.currentRow + 1 - - - item = self.dataPtr.dereference() - self.dataPtr = self.dataPtr + 1 - if (self.cols == 1): #if it's a column vector - return ('[%d]' % (row,), item) - elif (self.rows == 1): #if it's a row vector - return ('[%d]' % (col,), item) - return ('[%d,%d]' % (row, col), item) - - def children(self): - - return self._iterator(self.rows, self.cols, self.data, self.rowMajor) - - def to_string(self): - return "Eigen::%s<%s,%d,%d,%s> (data ptr: %s)" % (self.variety, self.innerType, self.rows, self.cols, "RowMajor" if self.rowMajor else "ColMajor", self.data) + "Print Eigen Matrix or Array of some kind" + + def __init__(self, variety, val): + "Extract all the necessary information" + + # Save the variety (presumably "Matrix" or "Array") for later usage + self.variety = variety + + # The gdb extension does not support value template arguments - need to extract them by hand + type = val.type + if type.code == gdb.TYPE_CODE_REF: + type = type.target() + self.type = type.unqualified().strip_typedefs() + tag = self.type.tag + regex = re.compile("\<.*\>") + m = 
regex.findall(tag)[0][1:-1] + template_params = m.split(",") + template_params = [x.replace(" ", "") for x in template_params] + + if ( + template_params[1] == "-0x00000000000000001" + or template_params[1] == "-0x000000001" + or template_params[1] == "-1" + ): + self.rows = val["m_storage"]["m_rows"] + else: + self.rows = int(template_params[1]) + + if ( + template_params[2] == "-0x00000000000000001" + or template_params[2] == "-0x000000001" + or template_params[2] == "-1" + ): + self.cols = val["m_storage"]["m_cols"] + else: + self.cols = int(template_params[2]) + + self.options = 0 # default value + if len(template_params) > 3: + self.options = template_params[3] + + self.rowMajor = int(self.options) & 0x1 + + self.innerType = self.type.template_argument(0) + + self.val = val + + # Fixed size matrices have a struct as their storage, so we need to walk through this + self.data = self.val["m_storage"]["m_data"] + if self.data.type.code == gdb.TYPE_CODE_STRUCT: + self.data = self.data["array"] + self.data = self.data.cast(self.innerType.pointer()) + + class _iterator: + def __init__(self, rows, cols, dataPtr, rowMajor): + self.rows = rows + self.cols = cols + self.dataPtr = dataPtr + self.currentRow = 0 + self.currentCol = 0 + self.rowMajor = rowMajor + + def __iter__(self): + return self + + def next(self): + return self.__next__() # Python 2.x compatibility + + def __next__(self): + + row = self.currentRow + col = self.currentCol + if self.rowMajor == 0: + if self.currentCol >= self.cols: + raise StopIteration + + self.currentRow = self.currentRow + 1 + if self.currentRow >= self.rows: + self.currentRow = 0 + self.currentCol = self.currentCol + 1 + else: + if self.currentRow >= self.rows: + raise StopIteration + + self.currentCol = self.currentCol + 1 + if self.currentCol >= self.cols: + self.currentCol = 0 + self.currentRow = self.currentRow + 1 + + item = self.dataPtr.dereference() + self.dataPtr = self.dataPtr + 1 + if self.cols == 1: # if it's a column vector + return ("[%d]" % (row,), item) + elif self.rows == 1: # if it's a row vector + return ("[%d]" % (col,), item) + return ("[%d,%d]" % (row, col), item) + + def children(self): + + return self._iterator(self.rows, self.cols, self.data, self.rowMajor) + + def to_string(self): + return "Eigen::%s<%s,%d,%d,%s> (data ptr: %s)" % ( + self.variety, + self.innerType, + self.rows, + self.cols, + "RowMajor" if self.rowMajor else "ColMajor", + self.data, + ) + class EigenQuaternionPrinter: - "Print an Eigen Quaternion" - - def __init__(self, val): - "Extract all the necessary information" - # The gdb extension does not support value template arguments - need to extract them by hand - type = val.type - if type.code == gdb.TYPE_CODE_REF: - type = type.target() - self.type = type.unqualified().strip_typedefs() - self.innerType = self.type.template_argument(0) - self.val = val - - # Quaternions have a struct as their storage, so we need to walk through this - self.data = self.val['m_coeffs']['m_storage']['m_data']['array'] - self.data = self.data.cast(self.innerType.pointer()) - - class _iterator: - def __init__ (self, dataPtr): - self.dataPtr = dataPtr - self.currentElement = 0 - self.elementNames = ['x', 'y', 'z', 'w'] - - def __iter__ (self): - return self - - def next(self): - return self.__next__() # Python 2.x compatibility - - def __next__(self): - element = self.currentElement - - if self.currentElement >= 4: #there are 4 elements in a quanternion - raise StopIteration - - self.currentElement = self.currentElement + 1 - - item = 
self.dataPtr.dereference() - self.dataPtr = self.dataPtr + 1 - return ('[%s]' % (self.elementNames[element],), item) - - def children(self): - - return self._iterator(self.data) - - def to_string(self): - return "Eigen::Quaternion<%s> (data ptr: %s)" % (self.innerType, self.data) - -def build_eigen_dictionary (): - pretty_printers_dict[re.compile('^Eigen::Quaternion<.*>$')] = lambda val: EigenQuaternionPrinter(val) - pretty_printers_dict[re.compile('^Eigen::Matrix<.*>$')] = lambda val: EigenMatrixPrinter("Matrix", val) - pretty_printers_dict[re.compile('^Eigen::Array<.*>$')] = lambda val: EigenMatrixPrinter("Array", val) + "Print an Eigen Quaternion" + + def __init__(self, val): + "Extract all the necessary information" + # The gdb extension does not support value template arguments - need to extract them by hand + type = val.type + if type.code == gdb.TYPE_CODE_REF: + type = type.target() + self.type = type.unqualified().strip_typedefs() + self.innerType = self.type.template_argument(0) + self.val = val + + # Quaternions have a struct as their storage, so we need to walk through this + self.data = self.val["m_coeffs"]["m_storage"]["m_data"]["array"] + self.data = self.data.cast(self.innerType.pointer()) + + class _iterator: + def __init__(self, dataPtr): + self.dataPtr = dataPtr + self.currentElement = 0 + self.elementNames = ["x", "y", "z", "w"] + + def __iter__(self): + return self + + def next(self): + return self.__next__() # Python 2.x compatibility + + def __next__(self): + element = self.currentElement + + if self.currentElement >= 4: # there are 4 elements in a quanternion + raise StopIteration + + self.currentElement = self.currentElement + 1 + + item = self.dataPtr.dereference() + self.dataPtr = self.dataPtr + 1 + return ("[%s]" % (self.elementNames[element],), item) + + def children(self): + + return self._iterator(self.data) + + def to_string(self): + return "Eigen::Quaternion<%s> (data ptr: %s)" % (self.innerType, self.data) + + +def build_eigen_dictionary(): + pretty_printers_dict[re.compile("^Eigen::Quaternion<.*>$")] = ( + lambda val: EigenQuaternionPrinter(val) + ) + pretty_printers_dict[re.compile("^Eigen::Matrix<.*>$")] = ( + lambda val: EigenMatrixPrinter("Matrix", val) + ) + pretty_printers_dict[re.compile("^Eigen::Array<.*>$")] = ( + lambda val: EigenMatrixPrinter("Array", val) + ) + def register_eigen_printers(obj): - "Register eigen pretty-printers with objfile Obj" + "Register eigen pretty-printers with objfile Obj" + + if obj == None: + obj = gdb + obj.pretty_printers.append(lookup_function) - if obj == None: - obj = gdb - obj.pretty_printers.append(lookup_function) def lookup_function(val): - "Look-up and return a pretty-printer that can print va." - - type = val.type - - if type.code == gdb.TYPE_CODE_REF: - type = type.target() - - type = type.unqualified().strip_typedefs() - - typename = type.tag - if typename == None: - return None - - for function in pretty_printers_dict: - if function.search(typename): - return pretty_printers_dict[function](val) - - return None + "Look-up and return a pretty-printer that can print va." 
+ + type = val.type + + if type.code == gdb.TYPE_CODE_REF: + type = type.target() + + type = type.unqualified().strip_typedefs() + + typename = type.tag + if typename == None: + return None + + for function in pretty_printers_dict: + if function.search(typename): + return pretty_printers_dict[function](val) + + return None + pretty_printers_dict = {} -build_eigen_dictionary () +build_eigen_dictionary() diff --git a/tests/conftest.py b/tests/conftest.py index e6567dda..556b6c7c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,5 @@ # Solution to import helper script by this answer https://stackoverflow.com/a/33515264 import os import sys + sys.path.append(os.path.join(os.path.dirname(__file__), "helpers")) diff --git a/tests/dataset/crossvalidation_test.py b/tests/dataset/crossvalidation_test.py index 572314d8..4f58420b 100644 --- a/tests/dataset/crossvalidation_test.py +++ b/tests/dataset/crossvalidation_test.py @@ -7,6 +7,7 @@ df = util_test.generate_normal_data(SIZE) + def test_cv_disjoint_indices(): cv = pbn.CrossValidation(df) @@ -14,17 +15,29 @@ def test_cv_disjoint_indices(): nptrain = np.asarray(train_indices) nptest = np.asarray(test_indices) combination = np.hstack((nptrain, nptest)) - - assert np.all(np.sort(combination) == np.arange(SIZE)), "Not all the examples are included in the cross validation." - assert np.all(train_df.to_pandas().to_numpy() == df.iloc[train_indices,:].to_numpy()), \ - "The CV iterator do not slice the train dataset exactly equal as the CV indices iterator." - assert np.all(test_df.to_pandas().to_numpy() == df.iloc[test_indices,:].to_numpy()), \ - "The CV iterator do not slice the test dataset exactly equal as the CV indices iterator." - assert np.setdiff1d(nptrain, nptest).shape == nptrain.shape, "The train indices includes test indices" - assert np.setdiff1d(nptest, nptrain).shape == nptest.shape, "The test indices includes train indices" - assert np.all(np.sort(np.setdiff1d(train_indices, test_indices)) == np.sort(train_indices)), "The train indices includes test indices" - assert np.all(np.sort(np.setdiff1d(test_indices, train_indices)) == np.sort(test_indices)), "The test indices includes train indices" + assert np.all( + np.sort(combination) == np.arange(SIZE) + ), "Not all the examples are included in the cross validation." + assert np.all( + train_df.to_pandas().to_numpy() == df.iloc[train_indices, :].to_numpy() + ), "The CV iterator do not slice the train dataset exactly equal as the CV indices iterator." + assert np.all( + test_df.to_pandas().to_numpy() == df.iloc[test_indices, :].to_numpy() + ), "The CV iterator do not slice the test dataset exactly equal as the CV indices iterator." + + assert ( + np.setdiff1d(nptrain, nptest).shape == nptrain.shape + ), "The train indices includes test indices" + assert ( + np.setdiff1d(nptest, nptrain).shape == nptest.shape + ), "The test indices includes train indices" + assert np.all( + np.sort(np.setdiff1d(train_indices, test_indices)) == np.sort(train_indices) + ), "The train indices includes test indices" + assert np.all( + np.sort(np.setdiff1d(test_indices, train_indices)) == np.sort(test_indices) + ), "The test indices includes train indices" def test_cv_fold(): @@ -33,35 +46,48 @@ def test_cv_fold(): for i, (train_df, test_df) in enumerate(cv): train_fold, test_fold = cv.fold(i) - assert train_fold.equals(train_df), "Train DataFrame fold() and __iter__ are not equal." - assert test_fold.equals(test_df), "Test DataFrame fold() and __iter__ are not equal." 
+ assert train_fold.equals( + train_df + ), "Train DataFrame fold() and __iter__ are not equal." + assert test_fold.equals( + test_df + ), "Test DataFrame fold() and __iter__ are not equal." def test_cv_seed(): cv = pbn.CrossValidation(df, seed=0) - + dataframes = list(cv) cv2 = pbn.CrossValidation(df, seed=0) for (train_cv, test_cv), (train_cv2, test_cv2) in zip(dataframes, cv2): - assert train_cv.equals(train_cv2), "Train CV DataFrames with the same seed are not equal." - assert test_cv.equals(test_cv2), "Test CV DataFrames with the same seed are not equal." + assert train_cv.equals( + train_cv2 + ), "Train CV DataFrames with the same seed are not equal." + assert test_cv.equals( + test_cv2 + ), "Test CV DataFrames with the same seed are not equal." cv3 = pbn.CrossValidation(df, seed=1) for (train_cv2, test_cv2), (train_cv3, test_cv3) in zip(cv2, cv3): - assert not train_cv2.equals(train_cv3), "Train CV DataFrames with different seeds return the same result." - assert not test_cv2.equals(test_cv3), "Test CV DataFrames with different seeds return the same result." + assert not train_cv2.equals( + train_cv3 + ), "Train CV DataFrames with different seeds return the same result." + assert not test_cv2.equals( + test_cv3 + ), "Test CV DataFrames with different seeds return the same result." + def test_cv_num_folds(): cv = pbn.CrossValidation(df) - + dataframes = list(cv) indices = list(cv.indices()) assert len(dataframes) == 10, "Default number of folds must be 10." assert len(indices) == 10, "Default number of folds must be 10." - + cv5 = pbn.CrossValidation(df, 5) dataframes = list(cv5) indices = list(cv5.indices()) @@ -71,38 +97,74 @@ def test_cv_num_folds(): def test_cv_loc(): cv = pbn.CrossValidation(df) - - for (train_df, test_df) in cv.loc("a"): - assert train_df.num_columns == 1, "Only column \"a\" must be present in train DataFrame." - assert test_df.num_columns == 1, "Only column \"a\" must be present in test DataFrame." + + for train_df, test_df in cv.loc("a"): + assert ( + train_df.num_columns == 1 + ), 'Only column "a" must be present in train DataFrame.' + assert ( + test_df.num_columns == 1 + ), 'Only column "a" must be present in test DataFrame.' train_schema = train_df.schema test_schema = test_df.schema - assert train_schema.names == ["a"], "Only column \"a\" must be present in train DataFrame." - assert test_schema.names == ["a"], "Only column \"a\" must be present in test DataFrame." - - for (train_df, test_df) in cv.loc(1): - assert train_df.num_columns == 1, "Only column \"b\" must be present in train DataFrame." - assert test_df.num_columns == 1, "Only column \"b\" must be present in test DataFrame." + assert train_schema.names == [ + "a" + ], 'Only column "a" must be present in train DataFrame.' + assert test_schema.names == [ + "a" + ], 'Only column "a" must be present in test DataFrame.' + + for train_df, test_df in cv.loc(1): + assert ( + train_df.num_columns == 1 + ), 'Only column "b" must be present in train DataFrame.' + assert ( + test_df.num_columns == 1 + ), 'Only column "b" must be present in test DataFrame.' train_schema = train_df.schema test_schema = test_df.schema - assert train_schema.names == ["b"], "Only column \"b\" must be present in train DataFrame." - assert test_schema.names == ["b"], "Only column \"b\" must be present in test DataFrame." - - for (train_df, test_df) in cv.loc(["b", "d"]): - assert train_df.num_columns == 2, "Only columns [\"b\", \"d\"] must be present in train DataFrame." 
- assert test_df.num_columns == 2, "Only column [\"b\", \"d\"] must be present in test DataFrame." + assert train_schema.names == [ + "b" + ], 'Only column "b" must be present in train DataFrame.' + assert test_schema.names == [ + "b" + ], 'Only column "b" must be present in test DataFrame.' + + for train_df, test_df in cv.loc(["b", "d"]): + assert ( + train_df.num_columns == 2 + ), 'Only columns ["b", "d"] must be present in train DataFrame.' + assert ( + test_df.num_columns == 2 + ), 'Only column ["b", "d"] must be present in test DataFrame.' train_schema = train_df.schema test_schema = test_df.schema - assert train_schema.names == ["b", "d"], "Only column [\"b\", \"d\"] must be present in train DataFrame." - assert test_schema.names == ["b", "d"], "Only column [\"b\", \"d\"] must be present in test DataFrame." - - for (train_df, test_df) in cv.loc([0, 2]): - assert train_df.num_columns == 2, "Only columns [\"a\", \"c\"] must be present in train DataFrame." - assert test_df.num_columns == 2, "Only column [\"a\", \"c\"] must be present in test DataFrame." + assert train_schema.names == [ + "b", + "d", + ], 'Only column ["b", "d"] must be present in train DataFrame.' + assert test_schema.names == [ + "b", + "d", + ], 'Only column ["b", "d"] must be present in test DataFrame.' + + for train_df, test_df in cv.loc([0, 2]): + assert ( + train_df.num_columns == 2 + ), 'Only columns ["a", "c"] must be present in train DataFrame.' + assert ( + test_df.num_columns == 2 + ), 'Only column ["a", "c"] must be present in test DataFrame.' train_schema = train_df.schema test_schema = test_df.schema - assert train_schema.names == ["a", "c"], "Only column [\"a\", \"c\"] must be present in train DataFrame." - assert test_schema.names == ["a", "c"], "Only column [\"a\", \"c\"] must be present in test DataFrame." + assert train_schema.names == [ + "a", + "c", + ], 'Only column ["a", "c"] must be present in train DataFrame.' + assert test_schema.names == [ + "a", + "c", + ], 'Only column ["a", "c"] must be present in test DataFrame.' def test_cv_null(): @@ -113,61 +175,102 @@ def test_cv_null(): d_null = np.random.randint(0, SIZE, size=100) df_null = df - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "a"] = np.nan + df_null.loc[df_null.index[b_null], "b"] = np.nan + df_null.loc[df_null.index[c_null], "c"] = np.nan + df_null.loc[df_null.index[d_null], "d"] = np.nan non_null = df_null.dropna() cv = pbn.CrossValidation(df_null) for (train_df, test_df), (train_indices, test_indices) in zip(cv, cv.indices()): - assert non_null.shape[0] == (train_df.num_rows + test_df.num_rows), "CV did not remove null instances correctly." + assert non_null.shape[0] == ( + train_df.num_rows + test_df.num_rows + ), "CV did not remove null instances correctly." nptrain = np.asarray(train_indices) nptest = np.asarray(test_indices) combination = np.hstack((nptrain, nptest)) - actual_combination = np.sort(np.setdiff1d(np.arange(SIZE), np.asarray(list(set(list(a_null) + list(b_null) + list(c_null) + list(d_null)))))) - - assert np.all(np.sort(combination) == actual_combination), "Not all the examples are included in the cross validation." - assert np.all(train_df.to_pandas().to_numpy() == df.iloc[train_indices,:].to_numpy()), \ - "The CV iterator do not slice the train dataset exactly equal as the CV indices iterator." 
- assert np.all(test_df.to_pandas().to_numpy() == df.iloc[test_indices,:].to_numpy()), \ - "The CV iterator do not slice the test dataset exactly equal as the CV indices iterator." - - assert np.setdiff1d(nptrain, nptest).shape == nptrain.shape, "The train indices includes test indices" - assert np.setdiff1d(nptest, nptrain).shape == nptest.shape, "The test indices includes train indices" - assert np.all(np.sort(np.setdiff1d(train_indices, test_indices)) == np.sort(train_indices)), "The train indices includes test indices" - assert np.all(np.sort(np.setdiff1d(test_indices, train_indices)) == np.sort(test_indices)), "The test indices includes train indices" + actual_combination = np.sort( + np.setdiff1d( + np.arange(SIZE), + np.asarray( + list(set(list(a_null) + list(b_null) + list(c_null) + list(d_null))) + ), + ) + ) + + assert np.all( + np.sort(combination) == actual_combination + ), "Not all the examples are included in the cross validation." + assert np.all( + train_df.to_pandas().to_numpy() == df.iloc[train_indices, :].to_numpy() + ), "The CV iterator do not slice the train dataset exactly equal as the CV indices iterator." + assert np.all( + test_df.to_pandas().to_numpy() == df.iloc[test_indices, :].to_numpy() + ), "The CV iterator do not slice the test dataset exactly equal as the CV indices iterator." + + assert ( + np.setdiff1d(nptrain, nptest).shape == nptrain.shape + ), "The train indices includes test indices" + assert ( + np.setdiff1d(nptest, nptrain).shape == nptest.shape + ), "The test indices includes train indices" + assert np.all( + np.sort(np.setdiff1d(train_indices, test_indices)) == np.sort(train_indices) + ), "The train indices includes test indices" + assert np.all( + np.sort(np.setdiff1d(test_indices, train_indices)) == np.sort(test_indices) + ), "The test indices includes train indices" cv_include_null = pbn.CrossValidation(df_null, include_null=True) - for (train_df, test_df), (train_indices, test_indices) in zip(cv_include_null, cv_include_null.indices()): - assert (train_df.num_rows + test_df.num_rows) == SIZE, "CV did not remove null instances correctly." + for (train_df, test_df), (train_indices, test_indices) in zip( + cv_include_null, cv_include_null.indices() + ): + assert ( + train_df.num_rows + test_df.num_rows + ) == SIZE, "CV did not remove null instances correctly." nptrain = np.asarray(train_indices) nptest = np.asarray(test_indices) combination = np.hstack((nptrain, nptest)) train_df_mat = train_df.to_pandas().to_numpy() - train_indices_mat = df.iloc[train_indices,:].to_numpy() + train_indices_mat = df.iloc[train_indices, :].to_numpy() test_df_mat = test_df.to_pandas().to_numpy() - test_indices_mat = df.iloc[test_indices,:].to_numpy() - - assert np.all(np.sort(combination) == np.arange(SIZE)), "Not all the examples are included in the cross validation." - assert np.all(np.isnan(train_df_mat) == np.isnan(train_indices_mat)), \ - "The null values are wrongly specified in the train DataFrame." - - assert np.all(train_df_mat[~np.isnan(train_df_mat)] == train_indices_mat[~np.isnan(train_df_mat)]), \ - "The CV iterator do not slice the train dataset exactly equal as the CV indices iterator." - - assert np.all(np.isnan(test_df_mat) == np.isnan(test_indices_mat)), \ - "The null values are wrongly specified in the test DataFrame." - assert np.all(test_df_mat[~np.isnan(test_df_mat)] == test_indices_mat[~np.isnan(test_df_mat)]), \ - "The CV iterator do not slice the test dataset exactly equal as the CV indices iterator." 
- - assert np.setdiff1d(nptrain, nptest).shape == nptrain.shape, "The train indices includes test indices" - assert np.setdiff1d(nptest, nptrain).shape == nptest.shape, "The test indices includes train indices" - assert np.all(np.sort(np.setdiff1d(train_indices, test_indices)) == np.sort(train_indices)), "The train indices includes test indices" - assert np.all(np.sort(np.setdiff1d(test_indices, train_indices)) == np.sort(test_indices)), "The test indices includes train indices" \ No newline at end of file + test_indices_mat = df.iloc[test_indices, :].to_numpy() + + assert np.all( + np.sort(combination) == np.arange(SIZE) + ), "Not all the examples are included in the cross validation." + assert np.all( + np.isnan(train_df_mat) == np.isnan(train_indices_mat) + ), "The null values are wrongly specified in the train DataFrame." + + assert np.all( + train_df_mat[~np.isnan(train_df_mat)] + == train_indices_mat[~np.isnan(train_df_mat)] + ), "The CV iterator do not slice the train dataset exactly equal as the CV indices iterator." + + assert np.all( + np.isnan(test_df_mat) == np.isnan(test_indices_mat) + ), "The null values are wrongly specified in the test DataFrame." + assert np.all( + test_df_mat[~np.isnan(test_df_mat)] + == test_indices_mat[~np.isnan(test_df_mat)] + ), "The CV iterator do not slice the test dataset exactly equal as the CV indices iterator." + + assert ( + np.setdiff1d(nptrain, nptest).shape == nptrain.shape + ), "The train indices includes test indices" + assert ( + np.setdiff1d(nptest, nptrain).shape == nptest.shape + ), "The test indices includes train indices" + assert np.all( + np.sort(np.setdiff1d(train_indices, test_indices)) == np.sort(train_indices) + ), "The train indices includes test indices" + assert np.all( + np.sort(np.setdiff1d(test_indices, train_indices)) == np.sort(test_indices) + ), "The test indices includes train indices" diff --git a/tests/dataset/holdout_test.py b/tests/dataset/holdout_test.py index 54bb6ad5..46a10fb6 100644 --- a/tests/dataset/holdout_test.py +++ b/tests/dataset/holdout_test.py @@ -8,35 +8,53 @@ df = util_test.generate_normal_data(SIZE) + def test_holdout_disjoint(): hold = pbn.HoldOut(df) train_df, test_df = hold.training_data(), hold.test_data() - assert (train_df.num_rows + test_df.num_rows) == SIZE, "HoldOut do not have the expected number of rows" + assert ( + train_df.num_rows + test_df.num_rows + ) == SIZE, "HoldOut do not have the expected number of rows" - assert train_df.num_rows == round((1-0.2) * df.shape[0]), "Train DataFrame do not have the expected number of instances" - assert test_df.num_rows == round(0.2 * df.shape[0]), "Test DataFrame do not have the expected number of instances" + assert train_df.num_rows == round( + (1 - 0.2) * df.shape[0] + ), "Train DataFrame do not have the expected number of instances" + assert test_df.num_rows == round( + 0.2 * df.shape[0] + ), "Test DataFrame do not have the expected number of instances" combination = pd.concat([train_df.to_pandas(), test_df.to_pandas()]) - assert df.sort_values("a", axis=0).reset_index(drop=True)\ - .equals(combination.sort_values("a", axis=0).reset_index(drop=True)),\ - "The combination of train and test dataset is not equal to the original DataFrame." - + assert ( + df.sort_values("a", axis=0) + .reset_index(drop=True) + .equals(combination.sort_values("a", axis=0).reset_index(drop=True)) + ), "The combination of train and test dataset is not equal to the original DataFrame." 
+ hold = pbn.HoldOut(df, test_ratio=0.3) train_df, test_df = hold.training_data(), hold.test_data() - assert (train_df.num_rows + test_df.num_rows) == SIZE, "HoldOut do not have the expected number of rows" + assert ( + train_df.num_rows + test_df.num_rows + ) == SIZE, "HoldOut do not have the expected number of rows" - assert train_df.num_rows == round((1-0.3) * df.shape[0]), "Train DataFrame do not have the expected number of instances" - assert test_df.num_rows == round(0.3 * df.shape[0]), "Test DataFrame do not have the expected number of instances" + assert train_df.num_rows == round( + (1 - 0.3) * df.shape[0] + ), "Train DataFrame do not have the expected number of instances" + assert test_df.num_rows == round( + 0.3 * df.shape[0] + ), "Test DataFrame do not have the expected number of instances" combination = pd.concat([train_df.to_pandas(), test_df.to_pandas()]) - assert df.sort_values("a", axis=0).reset_index(drop=True)\ - .equals(combination.sort_values("a", axis=0).reset_index(drop=True)),\ - "The combination of train and test dataset is not equal to the original DataFrame." + assert ( + df.sort_values("a", axis=0) + .reset_index(drop=True) + .equals(combination.sort_values("a", axis=0).reset_index(drop=True)) + ), "The combination of train and test dataset is not equal to the original DataFrame." + def test_holdout_seed(): hold = pbn.HoldOut(df, seed=0) @@ -45,14 +63,23 @@ def test_holdout_seed(): train_df, test_df = hold.training_data(), hold.test_data() train_df2, test_df2 = hold2.training_data(), hold2.test_data() - assert train_df.equals(train_df2), "Train CV DataFrames with the same seed are not equal." - assert test_df.equals(test_df2), "Test CV DataFrames with the same seed are not equal." + assert train_df.equals( + train_df2 + ), "Train CV DataFrames with the same seed are not equal." + assert test_df.equals( + test_df2 + ), "Test CV DataFrames with the same seed are not equal." hold3 = pbn.HoldOut(df, seed=1) train_df3, test_df3 = hold3.training_data(), hold3.test_data() - assert not train_df.equals(train_df3), "Train CV DataFrames with different seeds return the same result." - assert not test_df.equals(test_df3), "Test CV DataFrames with different seeds return the same result." + assert not train_df.equals( + train_df3 + ), "Train CV DataFrames with different seeds return the same result." + assert not test_df.equals( + test_df3 + ), "Test CV DataFrames with different seeds return the same result." 
+ def test_holdout_null(): np.random.seed(0) @@ -62,36 +89,50 @@ def test_holdout_null(): d_null = np.random.randint(0, SIZE, size=100) df_null = df - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "a"] = np.nan + df_null.loc[df_null.index[b_null], "b"] = np.nan + df_null.loc[df_null.index[c_null], "c"] = np.nan + df_null.loc[df_null.index[d_null], "d"] = np.nan non_null = df_null.dropna() hold = pbn.HoldOut(df_null) train_df, test_df = hold.training_data(), hold.test_data() - assert (train_df.num_rows + test_df.num_rows) == non_null.shape[0], "HoldOut do not have the expected number of rows" - assert train_df.num_rows == round((1-0.2) * non_null.shape[0]), "Train DataFrame do not have the expected number of instances" - assert test_df.num_rows == round(0.2 * non_null.shape[0]), "Test DataFrame do not have the expected number of instances" + assert (train_df.num_rows + test_df.num_rows) == non_null.shape[ + 0 + ], "HoldOut do not have the expected number of rows" + assert train_df.num_rows == round( + (1 - 0.2) * non_null.shape[0] + ), "Train DataFrame do not have the expected number of instances" + assert test_df.num_rows == round( + 0.2 * non_null.shape[0] + ), "Test DataFrame do not have the expected number of instances" combination = pd.concat([train_df.to_pandas(), test_df.to_pandas()]) - assert combination.sort_values("a", axis=0).reset_index(drop=True)\ - .equals(non_null.sort_values("a", axis=0).reset_index(drop=True)),\ - "The combination of train and test dataset is not equal to the original DataFrame." + assert ( + combination.sort_values("a", axis=0) + .reset_index(drop=True) + .equals(non_null.sort_values("a", axis=0).reset_index(drop=True)) + ), "The combination of train and test dataset is not equal to the original DataFrame." hold_null = pbn.HoldOut(df_null, include_null=True) train_df, test_df = hold_null.training_data(), hold_null.test_data() - assert (train_df.num_rows + test_df.num_rows) == SIZE, "HoldOut do not have the expected number of rows" - assert train_df.num_rows == round((1-0.2) * SIZE), "Train DataFrame do not have the expected number of instances" - assert test_df.num_rows == round(0.2 * SIZE), "Test DataFrame do not have the expected number of instances" + assert ( + train_df.num_rows + test_df.num_rows + ) == SIZE, "HoldOut do not have the expected number of rows" + assert train_df.num_rows == round( + (1 - 0.2) * SIZE + ), "Train DataFrame do not have the expected number of instances" + assert test_df.num_rows == round( + 0.2 * SIZE + ), "Test DataFrame do not have the expected number of instances" combination = pd.concat([train_df.to_pandas(), test_df.to_pandas()]) - assert combination.sort_values(["a", "b", "c", "d"], axis=0).reset_index(drop=True)\ - .equals(df.sort_values(["a", "b", "c", "d"], axis=0).reset_index(drop=True)),\ - "The combination of train and test dataset is not equal to the original DataFrame." - - \ No newline at end of file + assert ( + combination.sort_values(["a", "b", "c", "d"], axis=0) + .reset_index(drop=True) + .equals(df.sort_values(["a", "b", "c", "d"], axis=0).reset_index(drop=True)) + ), "The combination of train and test dataset is not equal to the original DataFrame." 
diff --git a/tests/factors/continuous/CKDE_test.py b/tests/factors/continuous/CKDE_test.py
index 3e605742..f4d3c060 100644
--- a/tests/factors/continuous/CKDE_test.py
+++ b/tests/factors/continuous/CKDE_test.py
@@ -15,19 +15,32 @@
 TEST_SIZE = 50
 
 df = util_test.generate_normal_data(SIZE, seed=0)
 df_small = util_test.generate_normal_data(SMALL_SIZE, seed=0)
-df_float = df.astype('float32')
-df_small_float = df_small.astype('float32')
+df_float = df.astype("float32")
+df_small_float = df_small.astype("float32")
+
 
 def test_variable():
-    for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]:
+    for variable, evidence in [
+        ("a", []),
+        ("b", ["a"]),
+        ("c", ["a", "b"]),
+        ("d", ["a", "b", "c"]),
+    ]:
         cpd = pbn.CKDE(variable, evidence)
         assert cpd.variable() == variable
 
+
 def test_evidence():
-    for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]:
+    for variable, evidence in [
+        ("a", []),
+        ("b", ["a"]),
+        ("c", ["a", "b"]),
+        ("d", ["a", "b", "c"]),
+    ]:
         cpd = pbn.CKDE(variable, evidence)
         assert cpd.evidence() == evidence
 
+
 def test_kde_data_type():
     k = pbn.CKDE("a", [])
 
@@ -40,18 +53,27 @@ def test_kde_data_type():
     k.fit(df_float)
     assert k.data_type() == pa.float32()
 
+
 def test_ckde_kde_joint():
     def _test_ckde_kde_joint_iter(variable, evidence, _df):
         cpd = pbn.CKDE(variable, evidence)
         cpd.fit(_df)
         kde_joint = cpd.kde_joint
         kde_joint().bandwidth = np.eye(len(evidence) + 1)
-        assert np.all(cpd.kde_joint().bandwidth == np.eye(len(evidence) + 1)), "kde_joint do not return a reference to the KDE joint, but a copy."
-
-    for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]:
+        assert np.all(
+            cpd.kde_joint().bandwidth == np.eye(len(evidence) + 1)
+        ), "kde_joint does not return a reference to the joint KDE, but a copy."
+
+    for variable, evidence in [
+        ("a", []),
+        ("b", ["a"]),
+        ("c", ["a", "b"]),
+        ("d", ["a", "b", "c"]),
+    ]:
         _test_ckde_kde_joint_iter(variable, evidence, df)
         _test_ckde_kde_joint_iter(variable, evidence, df_float)
 
+
 def test_ckde_kde_marg():
     def _test_ckde_kde_marg_iter(variable, evidence, _df):
         cpd = pbn.CKDE(variable, evidence)
@@ -61,20 +83,31 @@ def _test_ckde_kde_marg_iter(variable, evidence, _df):
         if evidence:
             assert kde_marg().fitted()
             kde_marg().bandwidth = np.eye(len(evidence))
-            assert np.all(cpd.kde_marg().bandwidth == np.eye(len(evidence))), "kde_marg do not return a reference to the KDE joint, but a copy."
+            assert np.all(
+                cpd.kde_marg().bandwidth == np.eye(len(evidence))
+            ), "kde_marg does not return a reference to the marginal KDE, but a copy."
else: # kde_marg contains garbage if there is no evidence pass - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + for variable, evidence in [ + ("a", []), + ("b", ["a"]), + ("c", ["a", "b"]), + ("d", ["a", "b", "c"]), + ]: _test_ckde_kde_marg_iter(variable, evidence, df) _test_ckde_kde_marg_iter(variable, evidence, df_float) + def test_ckde_fit(): def _test_ckde_fit(variables, _df, instances): npdata = _df.loc[:, variables].to_numpy() - scipy_kde = gaussian_kde(npdata[:instances, :].T, - bw_method=lambda s : np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor()) + scipy_kde = gaussian_kde( + npdata[:instances, :].T, + bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4)) + * s.scotts_factor(), + ) cpd = pbn.CKDE(variable, evidence) assert not cpd.fitted() @@ -83,19 +116,27 @@ def _test_ckde_fit(variables, _df, instances): kde_joint = cpd.kde_joint assert np.all(np.isclose(kde_joint().bandwidth, scipy_kde.covariance)) - + if evidence: kde_marg = cpd.kde_marg - assert np.all(np.isclose(kde_marg().bandwidth, scipy_kde.covariance[1:,1:])) - + assert np.all( + np.isclose(kde_marg().bandwidth, scipy_kde.covariance[1:, 1:]) + ) + assert cpd.num_instances() == instances - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + for variable, evidence in [ + ("a", []), + ("b", ["a"]), + ("c", ["a", "b"]), + ("d", ["a", "b", "c"]), + ]: variables = [variable] + evidence - for instances in [50, 1000, 10000]: + for instances in [50, 1000, 10000]: _test_ckde_fit(variables, df, instances) _test_ckde_fit(variables, df_float, instances) + def test_ckde_fit_null(): def _test_ckde_fit_null(variable, evidence, variables, _df, instances): cpd = pbn.CKDE(variable, evidence) @@ -104,18 +145,23 @@ def _test_ckde_fit_null(variable, evidence, variables, _df, instances): assert cpd.fitted() npdata = _df.loc[:, variables].to_numpy() - npdata_instances = npdata[:instances,:] + npdata_instances = npdata[:instances, :] nan_rows = np.any(np.isnan(npdata_instances), axis=1) - npdata_no_null = npdata_instances[~nan_rows,:] - scipy_kde = gaussian_kde(npdata_no_null.T, - bw_method=lambda s : np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor()) + npdata_no_null = npdata_instances[~nan_rows, :] + scipy_kde = gaussian_kde( + npdata_no_null.T, + bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4)) + * s.scotts_factor(), + ) kde_joint = cpd.kde_joint assert np.all(np.isclose(kde_joint().bandwidth, scipy_kde.covariance)) - + if evidence: kde_marg = cpd.kde_marg - assert np.all(np.isclose(kde_marg().bandwidth, scipy_kde.covariance[1:,1:])) + assert np.all( + np.isclose(kde_marg().bandwidth, scipy_kde.covariance[1:, 1:]) + ) assert cpd.num_instances() == scipy_kde.n @@ -126,23 +172,29 @@ def _test_ckde_fit_null(variable, evidence, variables, _df, instances): d_null = np.random.randint(0, SIZE, size=100) df_null = df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "a"] = np.nan + df_null.loc[df_null.index[b_null], "b"] = np.nan + df_null.loc[df_null.index[c_null], "c"] = np.nan + df_null.loc[df_null.index[d_null], "d"] = np.nan df_null_float = df_float.copy() - df_null_float.loc[df_null_float.index[a_null], 'a'] = np.nan - df_null_float.loc[df_null_float.index[b_null], 'b'] = np.nan - 
df_null_float.loc[df_null_float.index[c_null], 'c'] = np.nan - df_null_float.loc[df_null_float.index[d_null], 'd'] = np.nan - - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + df_null_float.loc[df_null_float.index[a_null], "a"] = np.nan + df_null_float.loc[df_null_float.index[b_null], "b"] = np.nan + df_null_float.loc[df_null_float.index[c_null], "c"] = np.nan + df_null_float.loc[df_null_float.index[d_null], "d"] = np.nan + + for variable, evidence in [ + ("a", []), + ("b", ["a"]), + ("c", ["a", "b"]), + ("d", ["a", "b", "c"]), + ]: variables = [variable] + evidence - for instances in [50, 1000, 10000]: + for instances in [50, 1000, 10000]: _test_ckde_fit_null(variable, evidence, variables, df, instances) _test_ckde_fit_null(variable, evidence, variables, df_float, instances) + def train_scipy_ckde(data, variable, evidence): variables = [variable] + evidence npdata_joint = data.loc[:, variables].to_numpy() @@ -150,15 +202,20 @@ def train_scipy_ckde(data, variable, evidence): nan_rows = np.any(np.isnan(npdata_joint), axis=1) - scipy_kde_joint = gaussian_kde(npdata_joint[~nan_rows,:].T, - bw_method=lambda s : np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor()) + scipy_kde_joint = gaussian_kde( + npdata_joint[~nan_rows, :].T, + bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor(), + ) if evidence: - scipy_kde_marg = gaussian_kde(npdata_marg[~nan_rows,:].T, bw_method=scipy_kde_joint.covariance_factor()) + scipy_kde_marg = gaussian_kde( + npdata_marg[~nan_rows, :].T, bw_method=scipy_kde_joint.covariance_factor() + ) else: scipy_kde_marg = None return scipy_kde_joint, scipy_kde_marg + def scipy_ckde_logpdf(test_data, joint_kde, marg_kde, variable, evidence): variables = [variable] + evidence test_data_joint = test_data.loc[:, variables].to_numpy() @@ -172,12 +229,15 @@ def scipy_ckde_logpdf(test_data, joint_kde, marg_kde, variable, evidence): result = np.full(test_data.shape[0], np.nan, dtype=np.float64) if evidence: - result[~nan_rows] = joint_kde.logpdf(test_data_joint[~nan_rows,:].T) - marg_kde.logpdf(test_data_marg[~nan_rows,:].T) + result[~nan_rows] = joint_kde.logpdf( + test_data_joint[~nan_rows, :].T + ) - marg_kde.logpdf(test_data_marg[~nan_rows, :].T) else: - result[~nan_rows] = joint_kde.logpdf(test_data_joint[~nan_rows,:].T) + result[~nan_rows] = joint_kde.logpdf(test_data_joint[~nan_rows, :].T) return result + def scipy_ckde_cdf(test_data, joint_kde, marg_kde, variable, evidence): variables = [variable] + evidence test_data_joint = test_data.loc[:, variables].to_numpy() @@ -196,28 +256,46 @@ def scipy_ckde_cdf(test_data, joint_kde, marg_kde, variable, evidence): if evidence: bandwidth = joint_kde.covariance - cond_var = bandwidth[0,0] - bandwidth[0, 1:].dot(np.linalg.inv(bandwidth[1:, 1:])).dot(bandwidth[1:, 0]) + cond_var = bandwidth[0, 0] - bandwidth[0, 1:].dot( + np.linalg.inv(bandwidth[1:, 1:]) + ).dot(bandwidth[1:, 0]) for test_index in np.where(~np.any(np.isnan(test_data_joint), axis=1))[0]: - w = mvn.logpdf(marg_kde.dataset.T, mean=test_data_marg[test_index,:], cov=marg_kde.covariance) + w = mvn.logpdf( + marg_kde.dataset.T, + mean=test_data_marg[test_index, :], + cov=marg_kde.covariance, + ) w = np.exp(w) total_w[:, test_index] = w - evidence_diff = test_data_marg[test_index,:] - joint_kde.dataset[1:,:].T - cond_mean = joint_kde.dataset[0,:] + bandwidth[0,1:].dot(np.linalg.inv(bandwidth[1:,1:])).dot(evidence_diff.T) + evidence_diff = test_data_marg[test_index, :] - joint_kde.dataset[1:, :].T + 
cond_mean = joint_kde.dataset[0, :] + bandwidth[0, 1:].dot( + np.linalg.inv(bandwidth[1:, 1:]) + ).dot(evidence_diff.T) conditional_mean[:, test_index] = cond_mean - total_cdf[:, test_index] = norm.cdf(test_data_joint[test_index,0], cond_mean, np.sqrt(cond_var)) + total_cdf[:, test_index] = norm.cdf( + test_data_joint[test_index, 0], cond_mean, np.sqrt(cond_var) + ) - result[test_index] = np.dot(w, norm.cdf(test_data_joint[test_index,0], cond_mean, np.sqrt(cond_var))) + result[test_index] = np.dot( + w, + norm.cdf(test_data_joint[test_index, 0], cond_mean, np.sqrt(cond_var)), + ) result /= np.sum(total_w, axis=0) else: - cdf = norm.cdf(test_data_joint[~nan_rows], joint_kde.dataset, np.sqrt(joint_kde.covariance[0,0])) + cdf = norm.cdf( + test_data_joint[~nan_rows], + joint_kde.dataset, + np.sqrt(joint_kde.covariance[0, 0]), + ) result[~nan_rows] = np.sum((1 / joint_kde.n) * cdf, axis=1) return result + def test_ckde_logl(): def _test_ckde_logl(variable, evidence, _df, _test_df): cpd = pbn.CKDE(variable, evidence) @@ -225,33 +303,45 @@ def _test_ckde_logl(variable, evidence, _df, _test_df): scipy_kde_joint, scipy_kde_marg = train_scipy_ckde(_df, variable, evidence) logl = cpd.logl(_test_df) - scipy = scipy_ckde_logpdf(_test_df, scipy_kde_joint, scipy_kde_marg, variable, evidence) + scipy = scipy_ckde_logpdf( + _test_df, scipy_kde_joint, scipy_kde_marg, variable, evidence + ) - if np.all(_df.dtypes == 'float32'): + if np.all(_df.dtypes == "float32"): assert np.all(np.isclose(logl, scipy, atol=0.0005)) else: assert np.all(np.isclose(logl, scipy)) - - test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) - test_df_float = test_df.astype('float32') - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) + test_df_float = test_df.astype("float32") + + for variable, evidence in [ + ("a", []), + ("b", ["a"]), + ("c", ["a", "b"]), + ("d", ["a", "b", "c"]), + ]: _test_ckde_logl(variable, evidence, df, test_df) _test_ckde_logl(variable, evidence, df_small, test_df) _test_ckde_logl(variable, evidence, df_float, test_df_float) _test_ckde_logl(variable, evidence, df_small_float, test_df_float) - cpd = pbn.CKDE('d', ['a', 'b', 'c']) + cpd = pbn.CKDE("d", ["a", "b", "c"]) cpd.fit(df) - cpd2 = pbn.CKDE('d', ['c', 'b', 'a']) + cpd2 = pbn.CKDE("d", ["c", "b", "a"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.logl(test_df), cpd2.logl(test_df))), "Order of evidence changes logl() result." + assert np.all( + np.isclose(cpd.logl(test_df), cpd2.logl(test_df)) + ), "Order of evidence changes logl() result." - cpd = pbn.CKDE('d', ['a', 'b', 'c']) + cpd = pbn.CKDE("d", ["a", "b", "c"]) cpd.fit(df_float) - cpd2 = pbn.CKDE('d', ['c', 'b', 'a']) + cpd2 = pbn.CKDE("d", ["c", "b", "a"]) cpd2.fit(df_float) - assert np.all(np.isclose(cpd.logl(test_df_float), cpd2.logl(test_df_float), atol=0.0005)), "Order of evidence changes logl() result." + assert np.all( + np.isclose(cpd.logl(test_df_float), cpd2.logl(test_df_float), atol=0.0005) + ), "Order of evidence changes logl() result." 
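+
+# The scipy reference used above follows from the definition of a conditional
+# density: log f(x | e) = log f(x, e) - log f(e), i.e. the joint KDE
+# log-density minus the marginal KDE log-density over the evidence; with no
+# evidence it reduces to the plain joint log-density.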
+ def test_ckde_logl_null(): def _test_ckde_logl_null(variable, evidence, _df, _test_df): @@ -261,7 +351,9 @@ def _test_ckde_logl_null(variable, evidence, _df, _test_df): scipy_kde_joint, scipy_kde_marg = train_scipy_ckde(_df, variable, evidence) logl = cpd.logl(_test_df) - scipy = scipy_ckde_logpdf(_test_df, scipy_kde_joint, scipy_kde_marg, variable, evidence) + scipy = scipy_ckde_logpdf( + _test_df, scipy_kde_joint, scipy_kde_marg, variable, evidence + ) if np.all(_test_df.dtypes == "float32"): assert np.all(np.isclose(logl, scipy, atol=0.0005, equal_nan=True)) @@ -269,7 +361,7 @@ def _test_ckde_logl_null(variable, evidence, _df, _test_df): assert np.all(np.isclose(logl, scipy, equal_nan=True)) test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) - test_df_float = test_df.astype('float32') + test_df_float = test_df.astype("float32") np.random.seed(0) a_null = np.random.randint(0, TEST_SIZE, size=10) @@ -278,40 +370,50 @@ def _test_ckde_logl_null(variable, evidence, _df, _test_df): d_null = np.random.randint(0, TEST_SIZE, size=10) df_null = test_df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "a"] = np.nan + df_null.loc[df_null.index[b_null], "b"] = np.nan + df_null.loc[df_null.index[c_null], "c"] = np.nan + df_null.loc[df_null.index[d_null], "d"] = np.nan df_null_float = test_df_float.copy() - df_null_float.loc[df_null_float.index[a_null], 'a'] = np.nan - df_null_float.loc[df_null_float.index[b_null], 'b'] = np.nan - df_null_float.loc[df_null_float.index[c_null], 'c'] = np.nan - df_null_float.loc[df_null_float.index[d_null], 'd'] = np.nan - - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + df_null_float.loc[df_null_float.index[a_null], "a"] = np.nan + df_null_float.loc[df_null_float.index[b_null], "b"] = np.nan + df_null_float.loc[df_null_float.index[c_null], "c"] = np.nan + df_null_float.loc[df_null_float.index[d_null], "d"] = np.nan + + for variable, evidence in [ + ("a", []), + ("b", ["a"]), + ("c", ["a", "b"]), + ("d", ["a", "b", "c"]), + ]: _test_ckde_logl_null(variable, evidence, df, df_null) _test_ckde_logl_null(variable, evidence, df_small, df_null) _test_ckde_logl_null(variable, evidence, df_float, df_null_float) _test_ckde_logl_null(variable, evidence, df_small_float, df_null_float) - cpd = pbn.CKDE('d', ['a', 'b', 'c']) + cpd = pbn.CKDE("d", ["a", "b", "c"]) cpd.fit(df) - cpd2 = pbn.CKDE('d', ['c', 'b', 'a']) + cpd2 = pbn.CKDE("d", ["c", "b", "a"]) cpd2.fit(df) ll = cpd.logl(df_null) ll2 = cpd2.logl(df_null) - assert np.all(np.isclose(ll, ll2, equal_nan=True)), "Order of evidence changes the position of nan values." + assert np.all( + np.isclose(ll, ll2, equal_nan=True) + ), "Order of evidence changes the position of nan values." - cpd = pbn.CKDE('d', ['a', 'b', 'c']) + cpd = pbn.CKDE("d", ["a", "b", "c"]) cpd.fit(df_float) - cpd2 = pbn.CKDE('d', ['c', 'b', 'a']) + cpd2 = pbn.CKDE("d", ["c", "b", "a"]) cpd2.fit(df_float) ll = cpd.logl(df_null_float) ll2 = cpd2.logl(df_null_float) - assert np.all(np.isclose(ll, ll2, equal_nan=True)), "Order of evidence changes the position of nan values." + assert np.all( + np.isclose(ll, ll2, equal_nan=True) + ), "Order of evidence changes the position of nan values." 
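+
+# slogl() below is validated against the sum of the per-instance reference
+# log-likelihoods; the null-value variant later uses np.nansum so that rows
+# with missing values do not contribute to the total.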
+ def test_ckde_slogl(): def _test_ckde_slogl(variable, evidence, _df, _test_df): @@ -319,34 +421,48 @@ def _test_ckde_slogl(variable, evidence, _df, _test_df): cpd.fit(_df) scipy_kde_joint, scipy_kde_marg = train_scipy_ckde(_df, variable, evidence) - scipy_logl = scipy_ckde_logpdf(_test_df, scipy_kde_joint, scipy_kde_marg, variable, evidence) + scipy_logl = scipy_ckde_logpdf( + _test_df, scipy_kde_joint, scipy_kde_marg, variable, evidence + ) if np.all(_test_df.dtypes == "float32"): # Allow an error of 0.0005 for each training instance. - assert np.isclose(cpd.slogl(_test_df), scipy_logl.sum(), atol=0.0005*_df.shape[0]) + assert np.isclose( + cpd.slogl(_test_df), scipy_logl.sum(), atol=0.0005 * _df.shape[0] + ) else: assert np.isclose(cpd.slogl(_test_df), scipy_logl.sum()) test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) - test_df_float = test_df.astype('float32') - - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + test_df_float = test_df.astype("float32") + + for variable, evidence in [ + ("a", []), + ("b", ["a"]), + ("c", ["a", "b"]), + ("d", ["a", "b", "c"]), + ]: _test_ckde_slogl(variable, evidence, df, test_df) _test_ckde_slogl(variable, evidence, df_small, test_df) _test_ckde_slogl(variable, evidence, df_float, test_df_float) _test_ckde_slogl(variable, evidence, df_small_float, test_df_float) - cpd = pbn.CKDE('d', ['a', 'b', 'c']) + cpd = pbn.CKDE("d", ["a", "b", "c"]) cpd.fit(df) - cpd2 = pbn.CKDE('d', ['c', 'b', 'a']) + cpd2 = pbn.CKDE("d", ["c", "b", "a"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.slogl(test_df), cpd2.slogl(test_df))), "Order of evidence changes slogl() result." + assert np.all( + np.isclose(cpd.slogl(test_df), cpd2.slogl(test_df)) + ), "Order of evidence changes slogl() result." - cpd = pbn.CKDE('d', ['a', 'b', 'c']) + cpd = pbn.CKDE("d", ["a", "b", "c"]) cpd.fit(df_float) - cpd2 = pbn.CKDE('d', ['c', 'b', 'a']) + cpd2 = pbn.CKDE("d", ["c", "b", "a"]) cpd2.fit(df_float) - assert np.all(np.isclose(cpd.slogl(test_df_float), cpd2.slogl(test_df_float))), "Order of evidence changes slogl() result." + assert np.all( + np.isclose(cpd.slogl(test_df_float), cpd2.slogl(test_df_float)) + ), "Order of evidence changes slogl() result." + def test_ckde_slogl_null(): def _test_ckde_slogl_null(variable, evidence, _df, _test_df): @@ -354,17 +470,20 @@ def _test_ckde_slogl_null(variable, evidence, _df, _test_df): cpd.fit(_df) scipy_kde_joint, scipy_kde_marg = train_scipy_ckde(_df, variable, evidence) - scipy_logl = scipy_ckde_logpdf(_test_df, scipy_kde_joint, scipy_kde_marg, variable, evidence) + scipy_logl = scipy_ckde_logpdf( + _test_df, scipy_kde_joint, scipy_kde_marg, variable, evidence + ) if np.all(_test_df.dtypes == "float32"): # Allow an error of 0.0005 for each training instance. 
- assert np.isclose(cpd.slogl(_test_df), np.nansum(scipy_logl), atol=0.0005*_df.shape[0]) + assert np.isclose( + cpd.slogl(_test_df), np.nansum(scipy_logl), atol=0.0005 * _df.shape[0] + ) else: assert np.isclose(cpd.slogl(_test_df), np.nansum(scipy_logl)) - test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) - test_df_float = test_df.astype('float32') + test_df_float = test_df.astype("float32") np.random.seed(0) a_null = np.random.randint(0, TEST_SIZE, size=10) @@ -373,35 +492,44 @@ def _test_ckde_slogl_null(variable, evidence, _df, _test_df): d_null = np.random.randint(0, TEST_SIZE, size=10) df_null = test_df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "a"] = np.nan + df_null.loc[df_null.index[b_null], "b"] = np.nan + df_null.loc[df_null.index[c_null], "c"] = np.nan + df_null.loc[df_null.index[d_null], "d"] = np.nan df_null_float = test_df_float.copy() - df_null_float.loc[df_null_float.index[a_null], 'a'] = np.nan - df_null_float.loc[df_null_float.index[b_null], 'b'] = np.nan - df_null_float.loc[df_null_float.index[c_null], 'c'] = np.nan - df_null_float.loc[df_null_float.index[d_null], 'd'] = np.nan - - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + df_null_float.loc[df_null_float.index[a_null], "a"] = np.nan + df_null_float.loc[df_null_float.index[b_null], "b"] = np.nan + df_null_float.loc[df_null_float.index[c_null], "c"] = np.nan + df_null_float.loc[df_null_float.index[d_null], "d"] = np.nan + + for variable, evidence in [ + ("a", []), + ("b", ["a"]), + ("c", ["a", "b"]), + ("d", ["a", "b", "c"]), + ]: _test_ckde_slogl_null(variable, evidence, df, df_null) _test_ckde_slogl_null(variable, evidence, df_small, df_null) _test_ckde_slogl_null(variable, evidence, df_float, df_null_float) _test_ckde_slogl_null(variable, evidence, df_small_float, df_null_float) - - cpd = pbn.CKDE('d', ['a', 'b', 'c']) + cpd = pbn.CKDE("d", ["a", "b", "c"]) cpd.fit(df) - cpd2 = pbn.CKDE('d', ['c', 'b', 'a']) + cpd2 = pbn.CKDE("d", ["c", "b", "a"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.slogl(df_null), cpd2.slogl(df_null))), "Order of evidence changes slogl() result." + assert np.all( + np.isclose(cpd.slogl(df_null), cpd2.slogl(df_null)) + ), "Order of evidence changes slogl() result." - cpd = pbn.CKDE('d', ['a', 'b', 'c']) + cpd = pbn.CKDE("d", ["a", "b", "c"]) cpd.fit(df_float) - cpd2 = pbn.CKDE('d', ['c', 'b', 'a']) + cpd2 = pbn.CKDE("d", ["c", "b", "a"]) cpd2.fit(df_float) - assert np.all(np.isclose(cpd.slogl(df_null_float), cpd2.slogl(df_null_float))), "Order of evidence changes slogl() result." + assert np.all( + np.isclose(cpd.slogl(df_null_float), cpd2.slogl(df_null_float)) + ), "Order of evidence changes slogl() result." 
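+
+# Background for the cdf tests below: scipy_ckde_cdf treats the CKDE as a
+# mixture of Gaussian kernels, so the conditional CDF given evidence e is a
+# weighted sum of Normal CDFs. With the joint bandwidth B partitioned over
+# (variable, evidence), kernel i contributes mean
+#     m_i = x_i + B_xe @ inv(B_ee) @ (e - e_i)
+# and the shared variance B_xx - B_xe @ inv(B_ee) @ B_ex, weighted by the
+# Gaussian density of e around the kernel's evidence point e_i.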
+
 
 def test_ckde_cdf():
     def _test_ckde_cdf(variable, evidence, _df, _test_df):
@@ -410,33 +538,45 @@ def _test_ckde_cdf(variable, evidence, _df, _test_df):
 
         scipy_kde_joint, scipy_kde_marg = train_scipy_ckde(_df, variable, evidence)
         cdf = cpd.cdf(_test_df)
-        scipy = scipy_ckde_cdf(_test_df, scipy_kde_joint, scipy_kde_marg, variable, evidence)
+        scipy = scipy_ckde_cdf(
+            _test_df, scipy_kde_joint, scipy_kde_marg, variable, evidence
+        )
 
-        if np.all(_df.dtypes == 'float32'):
+        if np.all(_df.dtypes == "float32"):
             assert np.all(np.isclose(cdf, scipy, atol=0.0005))
         else:
             assert np.all(np.isclose(cdf, scipy))
-    
-    test_df = util_test.generate_normal_data(TEST_SIZE, seed=1)
-    test_df_float = test_df.astype('float32')
 
-    for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]:
+    test_df = util_test.generate_normal_data(TEST_SIZE, seed=1)
+    test_df_float = test_df.astype("float32")
+
+    for variable, evidence in [
+        ("a", []),
+        ("b", ["a"]),
+        ("c", ["a", "b"]),
+        ("d", ["a", "b", "c"]),
+    ]:
         _test_ckde_cdf(variable, evidence, df, test_df)
         _test_ckde_cdf(variable, evidence, df_small, test_df)
         _test_ckde_cdf(variable, evidence, df_float, test_df_float)
         _test_ckde_cdf(variable, evidence, df_small_float, test_df_float)
 
-    cpd = pbn.CKDE('d', ['a', 'b', 'c'])
+    cpd = pbn.CKDE("d", ["a", "b", "c"])
     cpd.fit(df)
-    cpd2 = pbn.CKDE('d', ['c', 'b', 'a'])
+    cpd2 = pbn.CKDE("d", ["c", "b", "a"])
    cpd2.fit(df)
 
-    assert np.all(np.isclose(cpd.cdf(test_df), cpd2.cdf(test_df))), "Order of evidence changes logl() result."
+    assert np.all(
+        np.isclose(cpd.cdf(test_df), cpd2.cdf(test_df))
+    ), "Order of evidence changes cdf() result."
 
-    cpd = pbn.CKDE('d', ['a', 'b', 'c'])
+    cpd = pbn.CKDE("d", ["a", "b", "c"])
     cpd.fit(df_float)
-    cpd2 = pbn.CKDE('d', ['c', 'b', 'a'])
+    cpd2 = pbn.CKDE("d", ["c", "b", "a"])
     cpd2.fit(df_float)
 
-    assert np.all(np.isclose(cpd.cdf(test_df_float), cpd2.cdf(test_df_float), atol=0.0005)), "Order of evidence changes logl() result."
+    assert np.all(
+        np.isclose(cpd.cdf(test_df_float), cpd2.cdf(test_df_float), atol=0.0005)
+    ), "Order of evidence changes cdf() result."
+ def test_ckde_cdf_null(): def _test_ckde_cdf_null(variable, evidence, _df, _test_df): @@ -446,16 +586,17 @@ def _test_ckde_cdf_null(variable, evidence, _df, _test_df): scipy_kde_joint, scipy_kde_marg = train_scipy_ckde(_df, variable, evidence) cdf = cpd.cdf(_test_df) - scipy = scipy_ckde_cdf(_test_df, scipy_kde_joint, scipy_kde_marg, variable, evidence) + scipy = scipy_ckde_cdf( + _test_df, scipy_kde_joint, scipy_kde_marg, variable, evidence + ) - if np.all(_df.dtypes == 'float32'): + if np.all(_df.dtypes == "float32"): assert np.all(np.isclose(cdf, scipy, atol=0.0005, equal_nan=True)) else: assert np.all(np.isclose(cdf, scipy, equal_nan=True)) - test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) - test_df_float = test_df.astype('float32') + test_df_float = test_df.astype("float32") np.random.seed(0) a_null = np.random.randint(0, TEST_SIZE, size=10) @@ -464,91 +605,105 @@ def _test_ckde_cdf_null(variable, evidence, _df, _test_df): d_null = np.random.randint(0, TEST_SIZE, size=10) df_null = test_df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "a"] = np.nan + df_null.loc[df_null.index[b_null], "b"] = np.nan + df_null.loc[df_null.index[c_null], "c"] = np.nan + df_null.loc[df_null.index[d_null], "d"] = np.nan df_null_float = test_df_float.copy() - df_null_float.loc[df_null_float.index[a_null], 'a'] = np.nan - df_null_float.loc[df_null_float.index[b_null], 'b'] = np.nan - df_null_float.loc[df_null_float.index[c_null], 'c'] = np.nan - df_null_float.loc[df_null_float.index[d_null], 'd'] = np.nan - - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + df_null_float.loc[df_null_float.index[a_null], "a"] = np.nan + df_null_float.loc[df_null_float.index[b_null], "b"] = np.nan + df_null_float.loc[df_null_float.index[c_null], "c"] = np.nan + df_null_float.loc[df_null_float.index[d_null], "d"] = np.nan + + for variable, evidence in [ + ("a", []), + ("b", ["a"]), + ("c", ["a", "b"]), + ("d", ["a", "b", "c"]), + ]: _test_ckde_cdf_null(variable, evidence, df, df_null) _test_ckde_cdf_null(variable, evidence, df_small, df_null) _test_ckde_cdf_null(variable, evidence, df_float, df_null_float) _test_ckde_cdf_null(variable, evidence, df_small_float, df_null_float) - - cpd = pbn.CKDE('d', ['a', 'b', 'c']) + cpd = pbn.CKDE("d", ["a", "b", "c"]) cpd.fit(df) - cpd2 = pbn.CKDE('d', ['c', 'b', 'a']) + cpd2 = pbn.CKDE("d", ["c", "b", "a"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.cdf(df_null), cpd2.cdf(df_null), equal_nan=True)), "Order of evidence changes cdf() result." + assert np.all( + np.isclose(cpd.cdf(df_null), cpd2.cdf(df_null), equal_nan=True) + ), "Order of evidence changes cdf() result." - cpd = pbn.CKDE('d', ['a', 'b', 'c']) + cpd = pbn.CKDE("d", ["a", "b", "c"]) cpd.fit(df_float) - cpd2 = pbn.CKDE('d', ['c', 'b', 'a']) + cpd2 = pbn.CKDE("d", ["c", "b", "a"]) cpd2.fit(df_float) - assert np.all(np.isclose(cpd.cdf(df_null_float), - cpd2.cdf(df_null_float), - atol=0.0005, equal_nan=True)), "Order of evidence changes cdf() result." + assert np.all( + np.isclose( + cpd.cdf(df_null_float), cpd2.cdf(df_null_float), atol=0.0005, equal_nan=True + ) + ), "Order of evidence changes cdf() result." 
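+
+# The sampling tests below check only dtype propagation (float64 vs float32)
+# and the number of drawn values, recovered as nbytes divided by the byte
+# width of the Arrow type; the sampled distribution itself is not asserted.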
+ def test_ckde_sample(): SAMPLE_SIZE = 1000 - cpd = pbn.CKDE('a', []) + cpd = pbn.CKDE("a", []) cpd.fit(df) - + sampled = cpd.sample(SAMPLE_SIZE, None, 0) assert sampled.type == pa.float64() assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE - - cpd = pbn.CKDE('b', ['a']) + + cpd = pbn.CKDE("b", ["a"]) cpd.fit(df) - sampling_df = pd.DataFrame({'a': np.full((SAMPLE_SIZE,), 3.0)}) + sampling_df = pd.DataFrame({"a": np.full((SAMPLE_SIZE,), 3.0)}) sampled = cpd.sample(SAMPLE_SIZE, sampling_df, 0) assert sampled.type == pa.float64() assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE - - cpd = pbn.CKDE('c', ['a', 'b']) + + cpd = pbn.CKDE("c", ["a", "b"]) cpd.fit(df) - sampling_df = pd.DataFrame({'a': np.full((SAMPLE_SIZE,), 3.0), - 'b': np.full((SAMPLE_SIZE,), 7.45)}) + sampling_df = pd.DataFrame( + {"a": np.full((SAMPLE_SIZE,), 3.0), "b": np.full((SAMPLE_SIZE,), 7.45)} + ) sampled = cpd.sample(SAMPLE_SIZE, sampling_df, 0) assert sampled.type == pa.float64() assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE - cpd = pbn.CKDE('a', []) + cpd = pbn.CKDE("a", []) cpd.fit(df_float) - + sampled = cpd.sample(SAMPLE_SIZE, None, 0) assert sampled.type == pa.float32() assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE - - cpd = pbn.CKDE('b', ['a']) + + cpd = pbn.CKDE("b", ["a"]) cpd.fit(df_float) - sampling_df = pd.DataFrame({'a': np.full((SAMPLE_SIZE,), 3.0, dtype=np.float32)}) + sampling_df = pd.DataFrame({"a": np.full((SAMPLE_SIZE,), 3.0, dtype=np.float32)}) sampled = cpd.sample(SAMPLE_SIZE, sampling_df, 0) assert sampled.type == pa.float32() assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE - - cpd = pbn.CKDE('c', ['a', 'b']) + + cpd = pbn.CKDE("c", ["a", "b"]) cpd.fit(df_float) - sampling_df = pd.DataFrame({'a': np.full((SAMPLE_SIZE,), 3.0, dtype=np.float32), - 'b': np.full((SAMPLE_SIZE,), 7.45, dtype=np.float32)}) + sampling_df = pd.DataFrame( + { + "a": np.full((SAMPLE_SIZE,), 3.0, dtype=np.float32), + "b": np.full((SAMPLE_SIZE,), 7.45, dtype=np.float32), + } + ) sampled = cpd.sample(SAMPLE_SIZE, sampling_df, 0) assert sampled.type == pa.float32() - assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE \ No newline at end of file + assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE diff --git a/tests/factors/continuous/KDE_test.py b/tests/factors/continuous/KDE_test.py index f78ba2d0..6e60c813 100644 --- a/tests/factors/continuous/KDE_test.py +++ b/tests/factors/continuous/KDE_test.py @@ -9,10 +9,11 @@ SIZE = 500 df = util_test.generate_normal_data(SIZE, seed=0) -df_float = df.astype('float32') +df_float = df.astype("float32") + def test_check_type(): - cpd = pbn.KDE(['a']) + cpd = pbn.KDE(["a"]) cpd.fit(df) with pytest.raises(ValueError) as ex: cpd.logl(df_float) @@ -29,37 +30,49 @@ def test_check_type(): cpd.slogl(df) assert "Data type of training and test datasets is different." 
in str(ex.value) + def test_kde_variables(): - for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: + for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]: cpd = pbn.KDE(variables) assert cpd.variables() == variables + def test_kde_bandwidth(): - for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: + for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]: for instances in [50, 1000, 10000]: npdata = df.loc[:, variables].to_numpy() # Test normal reference rule - scipy_kde = gaussian_kde(npdata[:instances, :].T, - bw_method=lambda s : np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor()) + scipy_kde = gaussian_kde( + npdata[:instances, :].T, + bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4)) + * s.scotts_factor(), + ) cpd = pbn.KDE(variables) cpd.fit(df.iloc[:instances]) - assert np.all(np.isclose(cpd.bandwidth, scipy_kde.covariance)), "Wrong bandwidth computed with normal reference rule." + assert np.all( + np.isclose(cpd.bandwidth, scipy_kde.covariance) + ), "Wrong bandwidth computed with normal reference rule." cpd.fit(df_float.iloc[:instances]) - assert np.all(np.isclose(cpd.bandwidth, scipy_kde.covariance)), "Wrong bandwidth computed with normal reference rule." + assert np.all( + np.isclose(cpd.bandwidth, scipy_kde.covariance) + ), "Wrong bandwidth computed with normal reference rule." scipy_kde = gaussian_kde(npdata[:instances, :].T) cpd = pbn.KDE(variables, pbn.ScottsBandwidth()) cpd.fit(df.iloc[:instances]) - assert np.all(np.isclose(cpd.bandwidth, scipy_kde.covariance)), "Wrong bandwidth computed with Scott's rule." + assert np.all( + np.isclose(cpd.bandwidth, scipy_kde.covariance) + ), "Wrong bandwidth computed with Scott's rule." cpd.fit(df_float.iloc[:instances]) - assert np.all(np.isclose(cpd.bandwidth, scipy_kde.covariance)), "Wrong bandwidth computed with Scott's rule." - + assert np.all( + np.isclose(cpd.bandwidth, scipy_kde.covariance) + ), "Wrong bandwidth computed with Scott's rule." - cpd = pbn.KDE(['a']) + cpd = pbn.KDE(["a"]) cpd.fit(df) cpd.bandwidth = [[1]] assert cpd.bandwidth == np.asarray([[1]]), "Could not change bandwidth." @@ -68,6 +81,7 @@ def test_kde_bandwidth(): cpd.bandwidth = [[1]] assert cpd.bandwidth == np.asarray([[1]]), "Could not change bandwidth." + class UnitaryBandwidth(BandwidthSelector): def __init__(self): BandwidthSelector.__init__(self) @@ -75,6 +89,7 @@ def __init__(self): def bandwidth(self, df, variables): return np.eye(len(variables)) + def test_kde_new_bandwidth(): kde = pbn.KDE(["a"], UnitaryBandwidth()) kde.fit(df) @@ -90,6 +105,7 @@ def test_kde_new_bandwidth(): kde.fit(df_float) assert np.all(kde.bandwidth == np.eye(4)) + def test_kde_data_type(): k = pbn.KDE(["a"]) @@ -107,39 +123,52 @@ def test_kde_fit(): def _test_kde_fit_iter(variables, _df, instances): cpd = pbn.KDE(variables) assert not cpd.fitted() - cpd.fit(_df.iloc[:instances,:]) + cpd.fit(_df.iloc[:instances, :]) assert cpd.fitted() npdata = _df.loc[:, variables].to_numpy() - scipy_kde = gaussian_kde(npdata[:instances, :].T, - bw_method=lambda s : np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor()) + scipy_kde = gaussian_kde( + npdata[:instances, :].T, + bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4)) + * s.scotts_factor(), + ) assert scipy_kde.n == cpd.num_instances(), "Wrong number of training instances." assert scipy_kde.d == cpd.num_variables(), "Wrong number of training variables." 
- for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: + for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]: for instances in [50, 150, 500]: _test_kde_fit_iter(variables, df, instances) _test_kde_fit_iter(variables, df_float, instances) + def test_kde_fit_null(): def _test_kde_fit_null_iter(variables, _df, instances): cpd = pbn.KDE(variables) assert not cpd.fitted() - cpd.fit(_df.iloc[:instances,:]) + cpd.fit(_df.iloc[:instances, :]) assert cpd.fitted() npdata = _df.loc[:, variables].to_numpy() - npdata_instances = npdata[:instances,:] + npdata_instances = npdata[:instances, :] nan_rows = np.any(np.isnan(npdata_instances), axis=1) - npdata_no_null = npdata_instances[~nan_rows,:] - scipy_kde = gaussian_kde(npdata_no_null.T, - bw_method=lambda s : np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor()) - - assert scipy_kde.n == cpd.num_instances(), "Wrong number of training instances with null values." - assert scipy_kde.d == cpd.num_variables(), "Wrong number of training variables with null values." - assert np.all(np.isclose(scipy_kde.covariance, cpd.bandwidth)), "Wrong bandwidth with null values." + npdata_no_null = npdata_instances[~nan_rows, :] + scipy_kde = gaussian_kde( + npdata_no_null.T, + bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4)) + * s.scotts_factor(), + ) + + assert ( + scipy_kde.n == cpd.num_instances() + ), "Wrong number of training instances with null values." + assert ( + scipy_kde.d == cpd.num_variables() + ), "Wrong number of training variables with null values." + assert np.all( + np.isclose(scipy_kde.covariance, cpd.bandwidth) + ), "Wrong bandwidth with null values." np.random.seed(0) a_null = np.random.randint(0, SIZE, size=100) @@ -148,59 +177,68 @@ def _test_kde_fit_null_iter(variables, _df, instances): d_null = np.random.randint(0, SIZE, size=100) df_null = df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "a"] = np.nan + df_null.loc[df_null.index[b_null], "b"] = np.nan + df_null.loc[df_null.index[c_null], "c"] = np.nan + df_null.loc[df_null.index[d_null], "d"] = np.nan df_null_float = df_float.copy() - df_null_float.loc[df_null_float.index[a_null], 'a'] = np.nan - df_null_float.loc[df_null_float.index[b_null], 'b'] = np.nan - df_null_float.loc[df_null_float.index[c_null], 'c'] = np.nan - df_null_float.loc[df_null_float.index[d_null], 'd'] = np.nan + df_null_float.loc[df_null_float.index[a_null], "a"] = np.nan + df_null_float.loc[df_null_float.index[b_null], "b"] = np.nan + df_null_float.loc[df_null_float.index[c_null], "c"] = np.nan + df_null_float.loc[df_null_float.index[d_null], "d"] = np.nan - for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: + for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]: for instances in [50, 150, 500]: _test_kde_fit_null_iter(variables, df_null, instances) _test_kde_fit_null_iter(variables, df_null_float, instances) + def test_kde_logl(): def _test_kde_logl_iter(variables, _df, _test_df): cpd = pbn.KDE(variables) cpd.fit(_df) npdata = _df.loc[:, variables].to_numpy() - scipy_kde = gaussian_kde(npdata.T, - bw_method=lambda s : np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor()) + scipy_kde = gaussian_kde( + npdata.T, + bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4)) + * s.scotts_factor(), + ) test_npdata 
= _test_df.loc[:, variables].to_numpy() logl = cpd.logl(_test_df) scipy = scipy_kde.logpdf(test_npdata.T) - if np.all(_df.dtypes == 'float32'): + if np.all(_df.dtypes == "float32"): assert np.all(np.isclose(logl, scipy, atol=0.0005)) else: assert np.all(np.isclose(logl, scipy)) test_df = util_test.generate_normal_data(50, seed=1) - test_df_float = test_df.astype('float32') + test_df_float = test_df.astype("float32") - for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: + for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]: _test_kde_logl_iter(variables, df, test_df) _test_kde_logl_iter(variables, df_float, test_df_float) - cpd = pbn.KDE(['d', 'a', 'b', 'c']) + cpd = pbn.KDE(["d", "a", "b", "c"]) cpd.fit(df) - cpd2 = pbn.KDE(['a', 'c', 'd', 'b']) + cpd2 = pbn.KDE(["a", "c", "d", "b"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.logl(test_df), cpd2.logl(test_df))), "Order of evidence changes logl() result." + assert np.all( + np.isclose(cpd.logl(test_df), cpd2.logl(test_df)) + ), "Order of evidence changes logl() result." - cpd = pbn.KDE(['d', 'a', 'b', 'c']) + cpd = pbn.KDE(["d", "a", "b", "c"]) cpd.fit(df_float) - cpd2 = pbn.KDE(['a', 'c', 'd', 'b']) + cpd2 = pbn.KDE(["a", "c", "d", "b"]) cpd2.fit(df_float) - assert np.all(np.isclose(cpd.logl(test_df_float), cpd2.logl(test_df_float))), "Order of evidence changes logl() result." + assert np.all( + np.isclose(cpd.logl(test_df_float), cpd2.logl(test_df_float)) + ), "Order of evidence changes logl() result." + def test_kde_logl_null(): def _test_kde_logl_null_iter(variables, _df, _test_df): @@ -208,8 +246,11 @@ def _test_kde_logl_null_iter(variables, _df, _test_df): cpd.fit(_df) npdata = _df.loc[:, variables].to_numpy() - scipy_kde = gaussian_kde(npdata.T, - bw_method=lambda s : np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor()) + scipy_kde = gaussian_kde( + npdata.T, + bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4)) + * s.scotts_factor(), + ) test_npdata = _test_df.loc[:, variables].to_numpy() @@ -227,7 +268,7 @@ def _test_kde_logl_null_iter(variables, _df, _test_df): TEST_SIZE = 50 test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) - test_df_float = test_df.astype('float32') + test_df_float = test_df.astype("float32") np.random.seed(0) a_null = np.random.randint(0, TEST_SIZE, size=10) @@ -236,33 +277,42 @@ def _test_kde_logl_null_iter(variables, _df, _test_df): d_null = np.random.randint(0, TEST_SIZE, size=10) df_null = test_df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "a"] = np.nan + df_null.loc[df_null.index[b_null], "b"] = np.nan + df_null.loc[df_null.index[c_null], "c"] = np.nan + df_null.loc[df_null.index[d_null], "d"] = np.nan df_null_float = test_df_float.copy() - df_null_float.loc[df_null_float.index[a_null], 'a'] = np.nan - df_null_float.loc[df_null_float.index[b_null], 'b'] = np.nan - df_null_float.loc[df_null_float.index[c_null], 'c'] = np.nan - df_null_float.loc[df_null_float.index[d_null], 'd'] = np.nan + df_null_float.loc[df_null_float.index[a_null], "a"] = np.nan + df_null_float.loc[df_null_float.index[b_null], "b"] = np.nan + df_null_float.loc[df_null_float.index[c_null], "c"] = np.nan + df_null_float.loc[df_null_float.index[d_null], "d"] = np.nan - for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: + for variables in 
[["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]: _test_kde_logl_null_iter(variables, df, df_null) _test_kde_logl_null_iter(variables, df_float, df_null_float) - - cpd = pbn.KDE(['d', 'a', 'b', 'c']) + cpd = pbn.KDE(["d", "a", "b", "c"]) cpd.fit(df) - cpd2 = pbn.KDE(['a', 'c', 'd', 'b']) + cpd2 = pbn.KDE(["a", "c", "d", "b"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.logl(df_null), cpd2.logl(df_null), equal_nan=True)), "Order of evidence changes logl() result." + assert np.all( + np.isclose(cpd.logl(df_null), cpd2.logl(df_null), equal_nan=True) + ), "Order of evidence changes logl() result." - cpd = pbn.KDE(['d', 'a', 'b', 'c']) + cpd = pbn.KDE(["d", "a", "b", "c"]) cpd.fit(df_float) - cpd2 = pbn.KDE(['a', 'c', 'd', 'b']) + cpd2 = pbn.KDE(["a", "c", "d", "b"]) cpd2.fit(df_float) - assert np.all(np.isclose(cpd.logl(df_null_float), cpd2.logl(df_null_float), atol=0.0005, equal_nan=True)), "Order of evidence changes logl() result." + assert np.all( + np.isclose( + cpd.logl(df_null_float), + cpd2.logl(df_null_float), + atol=0.0005, + equal_nan=True, + ) + ), "Order of evidence changes logl() result." + def test_kde_slogl(): def _test_kde_slogl_iter(variables, _df, _test_df): @@ -270,31 +320,39 @@ def _test_kde_slogl_iter(variables, _df, _test_df): cpd.fit(_df) npdata = _df.loc[:, variables].to_numpy() - scipy_kde = gaussian_kde(npdata.T, - bw_method=lambda s : np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor()) + scipy_kde = gaussian_kde( + npdata.T, + bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4)) + * s.scotts_factor(), + ) test_npdata = _test_df.loc[:, variables].to_numpy() - assert np.all(np.isclose(cpd.slogl(_test_df), scipy_kde.logpdf(test_npdata.T).sum())) - + assert np.all( + np.isclose(cpd.slogl(_test_df), scipy_kde.logpdf(test_npdata.T).sum()) + ) test_df = util_test.generate_normal_data(50, seed=1) - test_df_float = test_df.astype('float32') + test_df_float = test_df.astype("float32") - for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: + for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]: _test_kde_slogl_iter(variables, df, test_df) _test_kde_slogl_iter(variables, df_float, test_df_float) - cpd = pbn.KDE(['d', 'a', 'b', 'c']) + cpd = pbn.KDE(["d", "a", "b", "c"]) cpd.fit(df) - cpd2 = pbn.KDE(['a', 'c', 'd', 'b']) + cpd2 = pbn.KDE(["a", "c", "d", "b"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.slogl(test_df), cpd2.slogl(test_df))), "Order of evidence changes slogl() result." + assert np.all( + np.isclose(cpd.slogl(test_df), cpd2.slogl(test_df)) + ), "Order of evidence changes slogl() result." - cpd = pbn.KDE(['d', 'a', 'b', 'c']) + cpd = pbn.KDE(["d", "a", "b", "c"]) cpd.fit(df_float) - cpd2 = pbn.KDE(['a', 'c', 'd', 'b']) + cpd2 = pbn.KDE(["a", "c", "d", "b"]) cpd2.fit(df_float) - assert np.all(np.isclose(cpd.slogl(test_df_float), cpd2.slogl(test_df_float))), "Order of evidence changes slogl() result." + assert np.all( + np.isclose(cpd.slogl(test_df_float), cpd2.slogl(test_df_float)) + ), "Order of evidence changes slogl() result." 
def test_kde_slogl_null(): @@ -303,19 +361,26 @@ def _test_kde_slogl_null_iter(variables, _df, _test_df): cpd.fit(_df) npdata = _df.loc[:, variables].to_numpy() - scipy_kde = gaussian_kde(npdata.T, - bw_method=lambda s : np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor()) + scipy_kde = gaussian_kde( + npdata.T, + bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4)) + * s.scotts_factor(), + ) test_npdata = _test_df.loc[:, variables].to_numpy() nan_rows = np.any(np.isnan(test_npdata), axis=1) - assert np.all(np.isclose(cpd.slogl(_test_df), scipy_kde.logpdf(test_npdata[~nan_rows].T).sum())) + assert np.all( + np.isclose( + cpd.slogl(_test_df), scipy_kde.logpdf(test_npdata[~nan_rows].T).sum() + ) + ) TEST_SIZE = 50 test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) - test_df_float = test_df.astype('float32') + test_df_float = test_df.astype("float32") np.random.seed(0) a_null = np.random.randint(0, TEST_SIZE, size=10) @@ -324,30 +389,33 @@ def _test_kde_slogl_null_iter(variables, _df, _test_df): d_null = np.random.randint(0, TEST_SIZE, size=10) df_null = test_df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "a"] = np.nan + df_null.loc[df_null.index[b_null], "b"] = np.nan + df_null.loc[df_null.index[c_null], "c"] = np.nan + df_null.loc[df_null.index[d_null], "d"] = np.nan df_null_float = test_df_float.copy() - df_null_float.loc[df_null_float.index[a_null], 'a'] = np.nan - df_null_float.loc[df_null_float.index[b_null], 'b'] = np.nan - df_null_float.loc[df_null_float.index[c_null], 'c'] = np.nan - df_null_float.loc[df_null_float.index[d_null], 'd'] = np.nan + df_null_float.loc[df_null_float.index[a_null], "a"] = np.nan + df_null_float.loc[df_null_float.index[b_null], "b"] = np.nan + df_null_float.loc[df_null_float.index[c_null], "c"] = np.nan + df_null_float.loc[df_null_float.index[d_null], "d"] = np.nan - for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: + for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]: _test_kde_slogl_null_iter(variables, df, df_null) _test_kde_slogl_null_iter(variables, df_float, df_null_float) - - cpd = pbn.KDE(['d', 'a', 'b', 'c']) + cpd = pbn.KDE(["d", "a", "b", "c"]) cpd.fit(df) - cpd2 = pbn.KDE(['a', 'c', 'd', 'b']) + cpd2 = pbn.KDE(["a", "c", "d", "b"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.slogl(df_null), cpd2.slogl(df_null))), "Order of evidence changes slogl() result." + assert np.all( + np.isclose(cpd.slogl(df_null), cpd2.slogl(df_null)) + ), "Order of evidence changes slogl() result." - cpd = pbn.KDE(['d', 'a', 'b', 'c']) + cpd = pbn.KDE(["d", "a", "b", "c"]) cpd.fit(df_float) - cpd2 = pbn.KDE(['a', 'c', 'd', 'b']) + cpd2 = pbn.KDE(["a", "c", "d", "b"]) cpd2.fit(df_float) - assert np.all(np.isclose(cpd.slogl(df_null_float), cpd2.slogl(df_null_float))), "Order of evidence changes slogl() result." + assert np.all( + np.isclose(cpd.slogl(df_null_float), cpd2.slogl(df_null_float)) + ), "Order of evidence changes slogl() result." 
diff --git a/tests/factors/continuous/LinearGaussianCPD_test.py b/tests/factors/continuous/LinearGaussianCPD_test.py index 546e7691..df42820b 100644 --- a/tests/factors/continuous/LinearGaussianCPD_test.py +++ b/tests/factors/continuous/LinearGaussianCPD_test.py @@ -12,13 +12,25 @@ df = util_test.generate_normal_data(SIZE) + def test_lg_variable(): - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + for variable, evidence in [ + ("a", []), + ("b", ["a"]), + ("c", ["a", "b"]), + ("d", ["a", "b", "c"]), + ]: cpd = pbn.LinearGaussianCPD(variable, evidence) assert cpd.variable() == variable + def test_lg_evidence(): - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + for variable, evidence in [ + ("a", []), + ("b", ["a"]), + ("c", ["a", "b"]), + ("d", ["a", "b", "c"]), + ]: cpd = pbn.LinearGaussianCPD(variable, evidence) assert cpd.evidence() == evidence @@ -26,26 +38,36 @@ def test_lg_evidence(): def fit_numpy(_df, variable, evidence): df_na = _df.loc[:, [variable] + evidence].dropna() linregress_data = np.column_stack((np.ones(df_na.shape[0]), df_na.loc[:, evidence])) - (beta, res, _, _) = np.linalg.lstsq(linregress_data, df_na.loc[:, variable], rcond=None) - + (beta, res, _, _) = np.linalg.lstsq( + linregress_data, df_na.loc[:, variable], rcond=None + ) + return beta, res / (df_na.count()[variable] - len(evidence) - 1) + def test_lg_data_type(): cpd = pbn.LinearGaussianCPD("a", []) assert cpd.data_type() == pa.float64() + def test_lg_fit(): - for variable, evidence in [("a", []), ("b", ["a"]), ("c", ["a", "b"]), ("d", ["a", "b", "c"])]: + for variable, evidence in [ + ("a", []), + ("b", ["a"]), + ("c", ["a", "b"]), + ("d", ["a", "b", "c"]), + ]: cpd = pbn.LinearGaussianCPD(variable, evidence) assert not cpd.fitted() cpd.fit(df) assert cpd.fitted() npbeta, npvar = fit_numpy(df, variable, evidence) - + assert np.all(np.isclose(npbeta, cpd.beta)), "Wrong beta vector." assert np.all(np.isclose(npvar, cpd.variance)), "Wrong variance." + def test_lg_fit_null(): np.random.seed(0) a_null = np.random.randint(0, SIZE, size=100) @@ -54,25 +76,31 @@ def test_lg_fit_null(): d_null = np.random.randint(0, SIZE, size=100) df_null = df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan - - for variable, evidence in [("a", []), ("b", ["a"]), ("c", ["a", "b"]), ("d", ["a", "b", "c"])]: + df_null.loc[df_null.index[a_null], "a"] = np.nan + df_null.loc[df_null.index[b_null], "b"] = np.nan + df_null.loc[df_null.index[c_null], "c"] = np.nan + df_null.loc[df_null.index[d_null], "d"] = np.nan + + for variable, evidence in [ + ("a", []), + ("b", ["a"]), + ("c", ["a", "b"]), + ("d", ["a", "b", "c"]), + ]: cpd = pbn.LinearGaussianCPD(variable, evidence) assert not cpd.fitted() cpd.fit(df_null) assert cpd.fitted() npbeta, npvar = fit_numpy(df_null, variable, evidence) - + assert np.all(np.isclose(npbeta, cpd.beta)), "Wrong beta vector." assert np.all(np.isclose(npvar, cpd.variance)), "Wrong variance." 
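+
+# fit_numpy above is the reference estimator for these tests: it prepends a
+# column of ones for the intercept, solves ordinary least squares with
+# np.linalg.lstsq, and estimates the residual variance as RSS divided by
+# n - len(evidence) - 1, the unbiased OLS estimator.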
+ def numpy_logpdf(test_df, variable, evidence, beta, variance): npdata = test_df.loc[:, evidence].to_numpy() - means = beta[0] + np.sum(beta[1:]*npdata, axis=1) + means = beta[0] + np.sum(beta[1:] * npdata, axis=1) result = np.empty((test_df.shape[0],)) @@ -81,12 +109,17 @@ def numpy_logpdf(test_df, variable, evidence, beta, variance): isnan_vec[np.isnan(test_df.loc[:, variable].to_numpy())] = True result[isnan_vec] = np.nan - result[~isnan_vec] = norm.logpdf(test_df.loc[:, variable].to_numpy()[~isnan_vec], means[~isnan_vec], np.sqrt(variance)) + result[~isnan_vec] = norm.logpdf( + test_df.loc[:, variable].to_numpy()[~isnan_vec], + means[~isnan_vec], + np.sqrt(variance), + ) return result + def numpy_cdf(test_df, variable, evidence, beta, variance): npdata = test_df.loc[:, evidence].to_numpy() - means = beta[0] + np.sum(beta[1:]*npdata, axis=1) + means = beta[0] + np.sum(beta[1:] * npdata, axis=1) result = np.empty((test_df.shape[0],)) @@ -95,29 +128,51 @@ def numpy_cdf(test_df, variable, evidence, beta, variance): isnan_vec[np.isnan(test_df.loc[:, variable].to_numpy())] = True result[isnan_vec] = np.nan - result[~isnan_vec] = norm.cdf(test_df.loc[:, variable].to_numpy()[~isnan_vec], means[~isnan_vec], np.sqrt(variance)) + result[~isnan_vec] = norm.cdf( + test_df.loc[:, variable].to_numpy()[~isnan_vec], + means[~isnan_vec], + np.sqrt(variance), + ) return result + def test_lg_logl(): test_df = util_test.generate_normal_data(5000) - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + for variable, evidence in [ + ("a", []), + ("b", ["a"]), + ("c", ["a", "b"]), + ("d", ["a", "b", "c"]), + ]: cpd = pbn.LinearGaussianCPD(variable, evidence) cpd.fit(df) beta = cpd.beta variance = cpd.variance - assert np.all(np.isclose(cpd.logl(test_df), numpy_logpdf(test_df, variable, evidence, beta, variance))),\ - "Wrong logl for LinearGaussianCPD(" + str(variable) + " | " + str(evidence) + ")" - - - cpd = pbn.LinearGaussianCPD('d', ['a', 'b', 'c']) + assert np.all( + np.isclose( + cpd.logl(test_df), + numpy_logpdf(test_df, variable, evidence, beta, variance), + ) + ), ( + "Wrong logl for LinearGaussianCPD(" + + str(variable) + + " | " + + str(evidence) + + ")" + ) + + cpd = pbn.LinearGaussianCPD("d", ["a", "b", "c"]) cpd.fit(df) - cpd2 = pbn.LinearGaussianCPD('d', ['c', 'a', 'b']) + cpd2 = pbn.LinearGaussianCPD("d", ["c", "a", "b"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.logl(test_df), cpd2.logl(test_df))), "The order of the evidence changes the logl() result." + assert np.all( + np.isclose(cpd.logl(test_df), cpd2.logl(test_df)) + ), "The order of the evidence changes the logl() result." 
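+
+# numpy_logpdf encodes the linear Gaussian model directly: each row's mean is
+# beta[0] + beta[1:] @ evidence, and the log-likelihood is the Normal log-pdf
+# at that mean with standard deviation sqrt(variance); rows with a missing
+# value for the variable propagate NaN. numpy_cdf is the analogous Normal CDF.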
+ def test_lg_logl_null(): test_df = util_test.generate_normal_data(5000) @@ -129,52 +184,84 @@ def test_lg_logl_null(): d_null = np.random.randint(0, 5000, size=100) df_null = test_df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan - - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + df_null.loc[df_null.index[a_null], "a"] = np.nan + df_null.loc[df_null.index[b_null], "b"] = np.nan + df_null.loc[df_null.index[c_null], "c"] = np.nan + df_null.loc[df_null.index[d_null], "d"] = np.nan + + for variable, evidence in [ + ("a", []), + ("b", ["a"]), + ("c", ["a", "b"]), + ("d", ["a", "b", "c"]), + ]: cpd = pbn.LinearGaussianCPD(variable, evidence) cpd.fit(df) beta = cpd.beta variance = cpd.variance - assert np.all(np.isclose( - cpd.logl(test_df), - numpy_logpdf(test_df, variable, evidence, beta, variance), equal_nan=True)),\ - "Wrong logl for LinearGaussianCPD(" + str(variable) + " | " + str(evidence) + ") with null values." - - cpd = pbn.LinearGaussianCPD('d', ['a', 'b', 'c']) + assert np.all( + np.isclose( + cpd.logl(test_df), + numpy_logpdf(test_df, variable, evidence, beta, variance), + equal_nan=True, + ) + ), ( + "Wrong logl for LinearGaussianCPD(" + + str(variable) + + " | " + + str(evidence) + + ") with null values." + ) + + cpd = pbn.LinearGaussianCPD("d", ["a", "b", "c"]) cpd.fit(df) - cpd2 = pbn.LinearGaussianCPD('d', ['c', 'a', 'b']) + cpd2 = pbn.LinearGaussianCPD("d", ["c", "a", "b"]) cpd2.fit(df) - assert np.all(np.isclose( - cpd.logl(test_df), - cpd2.logl(test_df), equal_nan=True)),\ - "The order of the evidence changes the logl() result." + assert np.all( + np.isclose(cpd.logl(test_df), cpd2.logl(test_df), equal_nan=True) + ), "The order of the evidence changes the logl() result." + def test_lg_slogl(): test_df = util_test.generate_normal_data(5000) - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + for variable, evidence in [ + ("a", []), + ("b", ["a"]), + ("c", ["a", "b"]), + ("d", ["a", "b", "c"]), + ]: cpd = pbn.LinearGaussianCPD(variable, evidence) cpd.fit(df) beta = cpd.beta variance = cpd.variance - assert np.all(np.isclose(cpd.slogl(test_df), np.sum(numpy_logpdf(test_df, variable, evidence, beta, variance)))),\ - "Wrong slogl for LinearGaussianCPD(" + str(variable) + " | " + str(evidence) + ")" - - cpd = pbn.LinearGaussianCPD('d', ['a', 'b', 'c']) + assert np.all( + np.isclose( + cpd.slogl(test_df), + np.sum(numpy_logpdf(test_df, variable, evidence, beta, variance)), + ) + ), ( + "Wrong slogl for LinearGaussianCPD(" + + str(variable) + + " | " + + str(evidence) + + ")" + ) + + cpd = pbn.LinearGaussianCPD("d", ["a", "b", "c"]) cpd.fit(df) - cpd2 = pbn.LinearGaussianCPD('d', ['c', 'a', 'b']) + cpd2 = pbn.LinearGaussianCPD("d", ["c", "a", "b"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.slogl(test_df), cpd2.slogl(test_df))), "The order of the evidence changes the slogl() result." + assert np.all( + np.isclose(cpd.slogl(test_df), cpd2.slogl(test_df)) + ), "The order of the evidence changes the slogl() result." 
+ def test_lg_slogl_null(): test_df = util_test.generate_normal_data(5000) @@ -186,48 +273,82 @@ def test_lg_slogl_null(): d_null = np.random.randint(0, 5000, size=100) df_null = test_df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan - - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + df_null.loc[df_null.index[a_null], "a"] = np.nan + df_null.loc[df_null.index[b_null], "b"] = np.nan + df_null.loc[df_null.index[c_null], "c"] = np.nan + df_null.loc[df_null.index[d_null], "d"] = np.nan + + for variable, evidence in [ + ("a", []), + ("b", ["a"]), + ("c", ["a", "b"]), + ("d", ["a", "b", "c"]), + ]: cpd = pbn.LinearGaussianCPD(variable, evidence) cpd.fit(df) beta = cpd.beta variance = cpd.variance - assert np.all(np.isclose(cpd.slogl(df_null), np.nansum(numpy_logpdf(df_null, variable, evidence, beta, variance)))),\ - "Wrong slogl for LinearGaussianCPD(" + str(variable) + " | " + str(evidence) + ") with null values." - - cpd = pbn.LinearGaussianCPD('d', ['a', 'b', 'c']) + assert np.all( + np.isclose( + cpd.slogl(df_null), + np.nansum(numpy_logpdf(df_null, variable, evidence, beta, variance)), + ) + ), ( + "Wrong slogl for LinearGaussianCPD(" + + str(variable) + + " | " + + str(evidence) + + ") with null values." + ) + + cpd = pbn.LinearGaussianCPD("d", ["a", "b", "c"]) cpd.fit(df) - cpd2 = pbn.LinearGaussianCPD('d', ['c', 'a', 'b']) + cpd2 = pbn.LinearGaussianCPD("d", ["c", "a", "b"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.slogl(df_null), cpd2.slogl(df_null))), "The order of the evidence changes the slogl() result." + assert np.all( + np.isclose(cpd.slogl(df_null), cpd2.slogl(df_null)) + ), "The order of the evidence changes the slogl() result." + def test_lg_cdf(): test_df = util_test.generate_normal_data(5000) - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + for variable, evidence in [ + ("a", []), + ("b", ["a"]), + ("c", ["a", "b"]), + ("d", ["a", "b", "c"]), + ]: cpd = pbn.LinearGaussianCPD(variable, evidence) cpd.fit(df) beta = cpd.beta variance = cpd.variance - assert np.all(np.isclose(cpd.cdf(test_df), numpy_cdf(test_df, variable, evidence, beta, variance))),\ - "Wrong cdf for LinearGaussianCPD(" + str(variable) + " | " + str(evidence) + ")" - - - cpd = pbn.LinearGaussianCPD('d', ['a', 'b', 'c']) + assert np.all( + np.isclose( + cpd.cdf(test_df), numpy_cdf(test_df, variable, evidence, beta, variance) + ) + ), ( + "Wrong cdf for LinearGaussianCPD(" + + str(variable) + + " | " + + str(evidence) + + ")" + ) + + cpd = pbn.LinearGaussianCPD("d", ["a", "b", "c"]) cpd.fit(df) - cpd2 = pbn.LinearGaussianCPD('d', ['c', 'a', 'b']) + cpd2 = pbn.LinearGaussianCPD("d", ["c", "a", "b"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.cdf(test_df), cpd2.cdf(test_df))), "The order of the evidence changes the cdf() result." + assert np.all( + np.isclose(cpd.cdf(test_df), cpd2.cdf(test_df)) + ), "The order of the evidence changes the cdf() result." 
+ def test_lg_cdf_null(): test_df = util_test.generate_normal_data(5000) @@ -239,59 +360,74 @@ def test_lg_cdf_null(): d_null = np.random.randint(0, 5000, size=100) df_null = test_df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan - - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + df_null.loc[df_null.index[a_null], "a"] = np.nan + df_null.loc[df_null.index[b_null], "b"] = np.nan + df_null.loc[df_null.index[c_null], "c"] = np.nan + df_null.loc[df_null.index[d_null], "d"] = np.nan + + for variable, evidence in [ + ("a", []), + ("b", ["a"]), + ("c", ["a", "b"]), + ("d", ["a", "b", "c"]), + ]: cpd = pbn.LinearGaussianCPD(variable, evidence) cpd.fit(df) beta = cpd.beta variance = cpd.variance - assert np.all(np.isclose( - cpd.cdf(df_null), - numpy_cdf(df_null, variable, evidence, beta, variance), equal_nan=True)),\ - "Wrong cdf for LinearGaussianCPD(" + str(variable) + " | " + str(evidence) + ") with null values." - - cpd = pbn.LinearGaussianCPD('d', ['a', 'b', 'c']) + assert np.all( + np.isclose( + cpd.cdf(df_null), + numpy_cdf(df_null, variable, evidence, beta, variance), + equal_nan=True, + ) + ), ( + "Wrong cdf for LinearGaussianCPD(" + + str(variable) + + " | " + + str(evidence) + + ") with null values." + ) + + cpd = pbn.LinearGaussianCPD("d", ["a", "b", "c"]) cpd.fit(df) - cpd2 = pbn.LinearGaussianCPD('d', ['c', 'a', 'b']) + cpd2 = pbn.LinearGaussianCPD("d", ["c", "a", "b"]) cpd2.fit(df) - assert np.all(np.isclose( - cpd.cdf(df_null), - cpd2.cdf(df_null), equal_nan=True)),\ - "The order of the evidence changes the cdf() result." + assert np.all( + np.isclose(cpd.cdf(df_null), cpd2.cdf(df_null), equal_nan=True) + ), "The order of the evidence changes the cdf() result." 
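test_lg_sample below only checks the Arrow type and the element count of the draws; the count falls out of the buffer arithmetic nbytes / (bit_width / 8). Semantically, sampling a fitted LinearGaussianCPD amounts to drawing the linear mean plus Gaussian noise, with evidence values taken row-wise from the supplied frame. A rough sketch of the distribution being sampled (sample_lg is a hypothetical name for illustration, not PyBNesian's implementation):

    import numpy as np

    def sample_lg(beta, variance, evidence_df, size, seed=0):
        rng = np.random.default_rng(seed)
        if evidence_df is None:
            # Root node: constant mean beta[0].
            means = np.full(size, beta[0])
        else:
            # Conditional mean per row of the evidence frame.
            means = beta[0] + np.sum(beta[1:] * evidence_df.to_numpy(), axis=1)
        return means + rng.normal(0.0, np.sqrt(variance), size=size)
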
+ def test_lg_sample(): SAMPLE_SIZE = 1000 - cpd = pbn.LinearGaussianCPD('a', []) + cpd = pbn.LinearGaussianCPD("a", []) cpd.fit(df) - + sampled = cpd.sample(SAMPLE_SIZE, None, 0) assert sampled.type == pa.float64() assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE - - cpd = pbn.LinearGaussianCPD('b', ['a']) + + cpd = pbn.LinearGaussianCPD("b", ["a"]) cpd.fit(df) - sampling_df = pd.DataFrame({'a': np.full((SAMPLE_SIZE,), 3.0)}) + sampling_df = pd.DataFrame({"a": np.full((SAMPLE_SIZE,), 3.0)}) sampled = cpd.sample(SAMPLE_SIZE, sampling_df, 0) assert sampled.type == pa.float64() assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE - - cpd = pbn.LinearGaussianCPD('c', ['a', 'b']) + + cpd = pbn.LinearGaussianCPD("c", ["a", "b"]) cpd.fit(df) - sampling_df = pd.DataFrame({'a': np.full((SAMPLE_SIZE,), 3.0), - 'b': np.full((SAMPLE_SIZE,), 7.45)}) + sampling_df = pd.DataFrame( + {"a": np.full((SAMPLE_SIZE,), 3.0), "b": np.full((SAMPLE_SIZE,), 7.45)} + ) sampled = cpd.sample(SAMPLE_SIZE, sampling_df, 0) assert sampled.type == pa.float64() - assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE \ No newline at end of file + assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE diff --git a/tests/factors/continuous/ProductKDE_test.py b/tests/factors/continuous/ProductKDE_test.py index 17625cd9..d76ff970 100644 --- a/tests/factors/continuous/ProductKDE_test.py +++ b/tests/factors/continuous/ProductKDE_test.py @@ -10,10 +10,11 @@ SIZE = 500 df = util_test.generate_normal_data(SIZE, seed=0) -df_float = df.astype('float32') +df_float = df.astype("float32") + def test_check_type(): - cpd = pbn.ProductKDE(['a']) + cpd = pbn.ProductKDE(["a"]) cpd.fit(df) with pytest.raises(ValueError) as ex: cpd.logl(df_float) @@ -30,11 +31,13 @@ def test_check_type(): cpd.slogl(df) assert "Data type of training and test datasets is different." in str(ex.value) + def test_productkde_variables(): - for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: + for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]: cpd = pbn.ProductKDE(variables) assert cpd.variables() == variables + def py_nr_bandwidth(df, variables): cov = df[variables].cov().to_numpy() delta = np.linalg.inv(np.diag(np.diag(cov))).dot(cov) @@ -42,9 +45,15 @@ def py_nr_bandwidth(df, variables): N = df.shape[0] d = len(variables) - k = 4*d*np.sqrt(np.linalg.det(delta))/ (2*(delta_inv.dot(delta_inv)).trace() + delta_inv.trace()**2) + k = ( + 4 + * d + * np.sqrt(np.linalg.det(delta)) + / (2 * (delta_inv.dot(delta_inv)).trace() + delta_inv.trace() ** 2) + ) return np.power(k / N, 2 / (d + 4)) * np.diag(cov) + def py_scott_bandwidth(df, variables): var = df[variables].var().to_numpy() N = df.shape[0] @@ -52,26 +61,42 @@ def py_scott_bandwidth(df, variables): return np.power(N, -2 / (d + 4)) * var + def test_productkde_bandwidth(): # for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: - for variables in [['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: + for variables in [["c", "a", "b"], ["d", "a", "b", "c"]]: for instances in [50, 150, 500]: cpd = pbn.ProductKDE(variables) cpd.fit(df.iloc[:instances]) - assert np.all(np.isclose(cpd.bandwidth, py_nr_bandwidth(df[:instances], variables))), "Wrong bandwidth computed with normal reference rule." + assert np.all( + np.isclose(cpd.bandwidth, py_nr_bandwidth(df[:instances], variables)) + ), "Wrong bandwidth computed with normal reference rule." 
cpd.fit(df_float.iloc[:instances]) - assert np.all(np.isclose(cpd.bandwidth, py_nr_bandwidth(df[:instances], variables), atol=0.0005)), "Wrong bandwidth computed with normal reference rule." + assert np.all( + np.isclose( + cpd.bandwidth, + py_nr_bandwidth(df[:instances], variables), + atol=0.0005, + ) + ), "Wrong bandwidth computed with normal reference rule." cpd = pbn.ProductKDE(variables, pbn.ScottsBandwidth()) cpd.fit(df.iloc[:instances]) - assert np.all(np.isclose(cpd.bandwidth, py_scott_bandwidth(df[:instances], variables))), "Wrong bandwidth computed with Scott's rule." + assert np.all( + np.isclose(cpd.bandwidth, py_scott_bandwidth(df[:instances], variables)) + ), "Wrong bandwidth computed with Scott's rule." cpd.fit(df_float.iloc[:instances]) - assert np.all(np.isclose(cpd.bandwidth, py_scott_bandwidth(df[:instances], variables), atol=0.0005)), "Wrong bandwidth computed with Scott's rule." - - - cpd = pbn.ProductKDE(['a']) + assert np.all( + np.isclose( + cpd.bandwidth, + py_scott_bandwidth(df[:instances], variables), + atol=0.0005, + ) + ), "Wrong bandwidth computed with Scott's rule." + + cpd = pbn.ProductKDE(["a"]) cpd.fit(df) cpd.bandwidth = [1] assert cpd.bandwidth == np.asarray([1]), "Could not change bandwidth." @@ -80,28 +105,31 @@ def test_productkde_bandwidth(): cpd.bandwidth = [1] assert cpd.bandwidth == np.asarray([1]), "Could not change bandwidth." + class UnitaryBandwidth(BandwidthSelector): def __init__(self): BandwidthSelector.__init__(self) def diag_bandwidth(self, df, variables): return np.ones((len(variables),)) - + + def test_productkde_new_bandwidth(): kde = pbn.ProductKDE(["a"], UnitaryBandwidth()) kde.fit(df) assert kde.bandwidth == np.ones((1,)) - + kde.fit(df_float) assert kde.bandwidth == np.ones((1,)) kde = pbn.ProductKDE(["a", "b", "c", "d"], UnitaryBandwidth()) kde.fit(df) assert np.all(kde.bandwidth == np.ones((4,))) - + kde.fit(df_float) assert np.all(kde.bandwidth == np.ones((4,))) + def test_productkde_data_type(): k = pbn.ProductKDE(["a"]) @@ -114,44 +142,75 @@ def test_productkde_data_type(): k.fit(df_float) assert k.data_type() == pa.float32() + def test_productkde_fit(): def _test_productkde_fit_iter(variables, _df, instances): cpd = pbn.ProductKDE(variables) assert not cpd.fitted() - cpd.fit(_df.iloc[:instances,:]) + cpd.fit(_df.iloc[:instances, :]) assert cpd.fitted() assert instances == cpd.num_instances(), "Wrong number of training instances." - assert len(variables) == cpd.num_variables(), "Wrong number of training variables." - if np.all(_df.dtypes == 'float32'): - assert np.all(np.isclose(cpd.bandwidth, py_nr_bandwidth(_df.iloc[:instances], variables), atol=0.0005)), "Wrong bandwidth." + assert ( + len(variables) == cpd.num_variables() + ), "Wrong number of training variables." + if np.all(_df.dtypes == "float32"): + assert np.all( + np.isclose( + cpd.bandwidth, + py_nr_bandwidth(_df.iloc[:instances], variables), + atol=0.0005, + ) + ), "Wrong bandwidth." else: - assert np.all(np.isclose(cpd.bandwidth, py_nr_bandwidth(_df.iloc[:instances], variables))), "Wrong bandwidth." + assert np.all( + np.isclose( + cpd.bandwidth, py_nr_bandwidth(_df.iloc[:instances], variables) + ) + ), "Wrong bandwidth." 
- for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: + for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]: for instances in [50, 150, 500]: _test_productkde_fit_iter(variables, df, instances) _test_productkde_fit_iter(variables, df_float, instances) + def test_productkde_fit_null(): def _test_productkde_fit_null_iter(variables, _df, instances): cpd = pbn.ProductKDE(variables) assert not cpd.fitted() - cpd.fit(_df.iloc[:instances,:]) + cpd.fit(_df.iloc[:instances, :]) assert cpd.fitted() npdata = _df.loc[:, variables].to_numpy() - npdata_instances = npdata[:instances,:] + npdata_instances = npdata[:instances, :] nan_rows = np.any(np.isnan(npdata_instances), axis=1) nonnan_indices = np.where(~nan_rows)[0] - assert (~nan_rows).sum() == cpd.num_instances(), "Wrong number of training instances with null values." - assert len(variables) == cpd.num_variables(), "Wrong number of training variables with null values." - if np.all(_df.dtypes == 'float32'): - assert np.all(np.isclose(cpd.bandwidth, py_nr_bandwidth(_df.iloc[nonnan_indices,:], variables), atol=0.0005)), "Wrong bandwidth with null values." + assert ( + ~nan_rows + ).sum() == cpd.num_instances(), ( + "Wrong number of training instances with null values." + ) + assert ( + len(variables) == cpd.num_variables() + ), "Wrong number of training variables with null values." + if np.all(_df.dtypes == "float32"): + assert np.all( + np.isclose( + cpd.bandwidth, + py_nr_bandwidth(_df.iloc[nonnan_indices, :], variables), + atol=0.0005, + ) + ), "Wrong bandwidth with null values." else: - assert np.all(np.isclose(cpd.bandwidth, py_nr_bandwidth(_df.iloc[nonnan_indices,:], variables))), "Wrong bandwidth with null values." + assert np.all( + np.isclose( + cpd.bandwidth, + py_nr_bandwidth(_df.iloc[nonnan_indices, :], variables), + ) + ), "Wrong bandwidth with null values." 
np.random.seed(0) a_null = np.random.randint(0, SIZE, size=100) @@ -160,18 +219,18 @@ def _test_productkde_fit_null_iter(variables, _df, instances): d_null = np.random.randint(0, SIZE, size=100) df_null = df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "a"] = np.nan + df_null.loc[df_null.index[b_null], "b"] = np.nan + df_null.loc[df_null.index[c_null], "c"] = np.nan + df_null.loc[df_null.index[d_null], "d"] = np.nan df_null_float = df_float.copy() - df_null_float.loc[df_null_float.index[a_null], 'a'] = np.nan - df_null_float.loc[df_null_float.index[b_null], 'b'] = np.nan - df_null_float.loc[df_null_float.index[c_null], 'c'] = np.nan - df_null_float.loc[df_null_float.index[d_null], 'd'] = np.nan + df_null_float.loc[df_null_float.index[a_null], "a"] = np.nan + df_null_float.loc[df_null_float.index[b_null], "b"] = np.nan + df_null_float.loc[df_null_float.index[c_null], "c"] = np.nan + df_null_float.loc[df_null_float.index[d_null], "d"] = np.nan - for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: + for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]: for instances in [50, 150, 500]: _test_productkde_fit_null_iter(variables, df_null, instances) _test_productkde_fit_null_iter(variables, df_null_float, instances) @@ -188,11 +247,11 @@ def factor_product_kernel(train_data): d = train_data.shape[1] num_factor = 4 * d * np.sqrt(np.linalg.det(delta)) - denom_factor = (2 * np.trace(np.dot(delta_inv, delta_inv)) + np.trace(delta_inv)**2) + denom_factor = 2 * np.trace(np.dot(delta_inv, delta_inv)) + np.trace(delta_inv) ** 2 k = num_factor / denom_factor - return (k / N)**(1. 
/ (d + 4.)) + return (k / N) ** (1.0 / (d + 4.0)) def test_productkde_logl(): @@ -206,37 +265,46 @@ def _test_productkde_logl_iter(variables, _df, _test_df): factor = factor_product_kernel(npdata) - final_scipy_kde = gaussian_kde(npdata.T, - bw_method=lambda gkde: factor * np.eye(npdata.shape[1], dtype=npdata.dtype)) + final_scipy_kde = gaussian_kde( + npdata.T, + bw_method=lambda gkde: factor * np.eye(npdata.shape[1], dtype=npdata.dtype), + ) final_scipy_kde.cho_cov = np.linalg.cholesky(final_scipy_kde.covariance) - final_scipy_kde.log_det = 2*np.log(np.diag(final_scipy_kde.cho_cov * np.sqrt(2*np.pi))).sum() + final_scipy_kde.log_det = ( + 2 * np.log(np.diag(final_scipy_kde.cho_cov * np.sqrt(2 * np.pi))).sum() + ) test_npdata = _test_df.loc[:, variables].to_numpy() scipy = final_scipy_kde.logpdf(test_npdata.T) - if np.all(_df.dtypes == 'float32'): + if np.all(_df.dtypes == "float32"): assert np.all(np.isclose(logl, scipy, atol=5e-3)) else: assert np.all(np.isclose(logl, scipy)) test_df = util_test.generate_normal_data(50, seed=1) - test_df_float = test_df.astype('float32') + test_df_float = test_df.astype("float32") - for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: + for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]: _test_productkde_logl_iter(variables, df, test_df) _test_productkde_logl_iter(variables, df_float, test_df_float) - cpd = pbn.ProductKDE(['d', 'a', 'b', 'c']) + cpd = pbn.ProductKDE(["d", "a", "b", "c"]) cpd.fit(df) - cpd2 = pbn.ProductKDE(['a', 'c', 'd', 'b']) + cpd2 = pbn.ProductKDE(["a", "c", "d", "b"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.logl(test_df), cpd2.logl(test_df))), "Order of evidence changes logl() result." + assert np.all( + np.isclose(cpd.logl(test_df), cpd2.logl(test_df)) + ), "Order of evidence changes logl() result." - cpd = pbn.ProductKDE(['d', 'a', 'b', 'c']) + cpd = pbn.ProductKDE(["d", "a", "b", "c"]) cpd.fit(df_float) - cpd2 = pbn.ProductKDE(['a', 'c', 'd', 'b']) + cpd2 = pbn.ProductKDE(["a", "c", "d", "b"]) cpd2.fit(df_float) - assert np.all(np.isclose(cpd.logl(test_df_float), cpd2.logl(test_df_float), atol=0.0005)), "Order of evidence changes logl() result." + assert np.all( + np.isclose(cpd.logl(test_df_float), cpd2.logl(test_df_float), atol=0.0005) + ), "Order of evidence changes logl() result." 
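The cross-check above coerces scipy.stats.gaussian_kde into a diagonal-bandwidth product kernel: the bw_method lambda relies on scipy multiplying the sample covariance elementwise by the square of the returned factor, so passing factor * np.eye(d) zeroes the off-diagonal entries and leaves exactly the per-dimension normal-reference bandwidths that py_nr_bandwidth computes (the tests then overwrite cho_cov and log_det, presumably to keep the normalisation consistent across scipy versions). In the notation of that helper, with \Sigma the sample covariance, D = \operatorname{diag}(\Sigma) and \Delta = D^{-1}\Sigma:

    k = \frac{4 d \sqrt{\det \Delta}}{2\,\operatorname{tr}(\Delta^{-1}\Delta^{-1}) + \operatorname{tr}(\Delta^{-1})^{2}},
    \qquad h_i^{2} = \left(\tfrac{k}{N}\right)^{2/(d+4)} \Sigma_{ii}

factor_product_kernel returns (k/N)^{1/(d+4)}, the square root of the common scale, which is why squaring it inside scipy reproduces the same diagonal.
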
+ def test_productkde_logl_null(): def _test_productkde_logl_null_iter(variables, _df, _test_df): @@ -248,10 +316,14 @@ def _test_productkde_logl_null_iter(variables, _df, _test_df): npdata = _df.loc[:, variables].to_numpy() factor = factor_product_kernel(npdata) - final_scipy_kde = gaussian_kde(npdata.T, - bw_method=lambda gkde: factor * np.eye(npdata.shape[1], dtype=npdata.dtype)) + final_scipy_kde = gaussian_kde( + npdata.T, + bw_method=lambda gkde: factor * np.eye(npdata.shape[1], dtype=npdata.dtype), + ) final_scipy_kde.cho_cov = np.linalg.cholesky(final_scipy_kde.covariance) - final_scipy_kde.log_det = 2*np.log(np.diag(final_scipy_kde.cho_cov * np.sqrt(2*np.pi))).sum() + final_scipy_kde.log_det = ( + 2 * np.log(np.diag(final_scipy_kde.cho_cov * np.sqrt(2 * np.pi))).sum() + ) test_npdata = _test_df.loc[:, variables].to_numpy() @@ -267,7 +339,7 @@ def _test_productkde_logl_null_iter(variables, _df, _test_df): TEST_SIZE = 50 test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) - test_df_float = test_df.astype('float32') + test_df_float = test_df.astype("float32") np.random.seed(0) a_null = np.random.randint(0, TEST_SIZE, size=10) @@ -276,32 +348,42 @@ def _test_productkde_logl_null_iter(variables, _df, _test_df): d_null = np.random.randint(0, TEST_SIZE, size=10) df_null = test_df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "a"] = np.nan + df_null.loc[df_null.index[b_null], "b"] = np.nan + df_null.loc[df_null.index[c_null], "c"] = np.nan + df_null.loc[df_null.index[d_null], "d"] = np.nan df_null_float = test_df_float.copy() - df_null_float.loc[df_null_float.index[a_null], 'a'] = np.nan - df_null_float.loc[df_null_float.index[b_null], 'b'] = np.nan - df_null_float.loc[df_null_float.index[c_null], 'c'] = np.nan - df_null_float.loc[df_null_float.index[d_null], 'd'] = np.nan + df_null_float.loc[df_null_float.index[a_null], "a"] = np.nan + df_null_float.loc[df_null_float.index[b_null], "b"] = np.nan + df_null_float.loc[df_null_float.index[c_null], "c"] = np.nan + df_null_float.loc[df_null_float.index[d_null], "d"] = np.nan - for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: + for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]: _test_productkde_logl_null_iter(variables, df, df_null) _test_productkde_logl_null_iter(variables, df_float, df_null_float) - cpd = pbn.ProductKDE(['d', 'a', 'b', 'c']) + cpd = pbn.ProductKDE(["d", "a", "b", "c"]) cpd.fit(df) - cpd2 = pbn.ProductKDE(['a', 'c', 'd', 'b']) + cpd2 = pbn.ProductKDE(["a", "c", "d", "b"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.logl(df_null), cpd2.logl(df_null), equal_nan=True)), "Order of evidence changes logl() result." + assert np.all( + np.isclose(cpd.logl(df_null), cpd2.logl(df_null), equal_nan=True) + ), "Order of evidence changes logl() result." - cpd = pbn.ProductKDE(['d', 'a', 'b', 'c']) + cpd = pbn.ProductKDE(["d", "a", "b", "c"]) cpd.fit(df_float) - cpd2 = pbn.ProductKDE(['a', 'c', 'd', 'b']) + cpd2 = pbn.ProductKDE(["a", "c", "d", "b"]) cpd2.fit(df_float) - assert np.all(np.isclose(cpd.logl(df_null_float), cpd2.logl(df_null_float), atol=0.0005, equal_nan=True)), "Order of evidence changes logl() result." 
+ assert np.all( + np.isclose( + cpd.logl(df_null_float), + cpd2.logl(df_null_float), + atol=0.0005, + equal_nan=True, + ) + ), "Order of evidence changes logl() result." + def test_productkde_slogl(): def _test_productkde_slogl_iter(variables, _df, _test_df): @@ -309,39 +391,56 @@ def _test_productkde_slogl_iter(variables, _df, _test_df): cpd.fit(_df) npdata = _df.loc[:, variables].to_numpy() - + factor = factor_product_kernel(npdata) - final_scipy_kde = gaussian_kde(npdata.T, - bw_method=lambda gkde: factor * np.eye(npdata.shape[1], dtype=npdata.dtype)) + final_scipy_kde = gaussian_kde( + npdata.T, + bw_method=lambda gkde: factor * np.eye(npdata.shape[1], dtype=npdata.dtype), + ) final_scipy_kde.cho_cov = np.linalg.cholesky(final_scipy_kde.covariance) - final_scipy_kde.log_det = 2*np.log(np.diag(final_scipy_kde.cho_cov * np.sqrt(2*np.pi))).sum() + final_scipy_kde.log_det = ( + 2 * np.log(np.diag(final_scipy_kde.cho_cov * np.sqrt(2 * np.pi))).sum() + ) test_npdata = _test_df.loc[:, variables].to_numpy() - if np.all(_df.dtypes == 'float32'): - assert np.all(np.isclose(cpd.slogl(_test_df), final_scipy_kde.logpdf(test_npdata.T).sum(), - atol=5e-3*test_npdata.shape[0])) + if np.all(_df.dtypes == "float32"): + assert np.all( + np.isclose( + cpd.slogl(_test_df), + final_scipy_kde.logpdf(test_npdata.T).sum(), + atol=5e-3 * test_npdata.shape[0], + ) + ) else: - assert np.all(np.isclose(cpd.slogl(_test_df), final_scipy_kde.logpdf(test_npdata.T).sum())) + assert np.all( + np.isclose( + cpd.slogl(_test_df), final_scipy_kde.logpdf(test_npdata.T).sum() + ) + ) test_df = util_test.generate_normal_data(50, seed=1) - test_df_float = test_df.astype('float32') + test_df_float = test_df.astype("float32") - for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: + for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]: _test_productkde_slogl_iter(variables, df, test_df) _test_productkde_slogl_iter(variables, df_float, test_df_float) - cpd = pbn.ProductKDE(['d', 'a', 'b', 'c']) + cpd = pbn.ProductKDE(["d", "a", "b", "c"]) cpd.fit(df) - cpd2 = pbn.ProductKDE(['a', 'c', 'd', 'b']) + cpd2 = pbn.ProductKDE(["a", "c", "d", "b"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.slogl(test_df), cpd2.slogl(test_df))), "Order of evidence changes slogl() result." + assert np.all( + np.isclose(cpd.slogl(test_df), cpd2.slogl(test_df)) + ), "Order of evidence changes slogl() result." - cpd = pbn.ProductKDE(['d', 'a', 'b', 'c']) + cpd = pbn.ProductKDE(["d", "a", "b", "c"]) cpd.fit(df_float) - cpd2 = pbn.ProductKDE(['a', 'c', 'd', 'b']) + cpd2 = pbn.ProductKDE(["a", "c", "d", "b"]) cpd2.fit(df_float) - assert np.all(np.isclose(cpd.slogl(test_df_float), cpd2.slogl(test_df_float), atol=0.0005)), "Order of evidence changes slogl() result." + assert np.all( + np.isclose(cpd.slogl(test_df_float), cpd2.slogl(test_df_float), atol=0.0005) + ), "Order of evidence changes slogl() result." 
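Across both the LinearGaussianCPD and ProductKDE suites, slogl is exercised as the sum of the per-row logl, with NaN rows contributing nothing once nulls are involved. The invariant the following null-value test encodes, written as a one-line reference (a sketch of the assumed relationship, not an API guarantee):

    import numpy as np

    def reference_slogl(cpd, data):
        # Assumed invariant: slogl equals the NaN-aware sum of the row-wise logl.
        return np.nansum(cpd.logl(data))
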
def test_productkde_slogl_null(): @@ -350,28 +449,40 @@ def _test_productkde_slogl_null_iter(variables, _df, _test_df): cpd.fit(_df) npdata = _df.loc[:, variables].to_numpy() - + factor = factor_product_kernel(npdata) - final_scipy_kde = gaussian_kde(npdata.T, - bw_method=lambda gkde: factor * np.eye(npdata.shape[1], dtype=npdata.dtype)) + final_scipy_kde = gaussian_kde( + npdata.T, + bw_method=lambda gkde: factor * np.eye(npdata.shape[1], dtype=npdata.dtype), + ) final_scipy_kde.cho_cov = np.linalg.cholesky(final_scipy_kde.covariance) - final_scipy_kde.log_det = 2*np.log(np.diag(final_scipy_kde.cho_cov * np.sqrt(2*np.pi))).sum() + final_scipy_kde.log_det = ( + 2 * np.log(np.diag(final_scipy_kde.cho_cov * np.sqrt(2 * np.pi))).sum() + ) test_npdata = _test_df.loc[:, variables].to_numpy() nan_rows = np.any(np.isnan(test_npdata), axis=1) if npdata.dtype == "float32": - assert np.all(np.isclose(cpd.slogl(_test_df), - np.sum(final_scipy_kde.logpdf(test_npdata[~nan_rows].T)), - atol=5e-3*test_npdata.shape[0])) + assert np.all( + np.isclose( + cpd.slogl(_test_df), + np.sum(final_scipy_kde.logpdf(test_npdata[~nan_rows].T)), + atol=5e-3 * test_npdata.shape[0], + ) + ) else: - assert np.all(np.isclose(cpd.slogl(_test_df), - np.sum(final_scipy_kde.logpdf(test_npdata[~nan_rows].T)))) + assert np.all( + np.isclose( + cpd.slogl(_test_df), + np.sum(final_scipy_kde.logpdf(test_npdata[~nan_rows].T)), + ) + ) TEST_SIZE = 50 test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) - test_df_float = test_df.astype('float32') + test_df_float = test_df.astype("float32") np.random.seed(0) a_null = np.random.randint(0, TEST_SIZE, size=10) @@ -380,30 +491,33 @@ def _test_productkde_slogl_null_iter(variables, _df, _test_df): d_null = np.random.randint(0, TEST_SIZE, size=10) df_null = test_df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "a"] = np.nan + df_null.loc[df_null.index[b_null], "b"] = np.nan + df_null.loc[df_null.index[c_null], "c"] = np.nan + df_null.loc[df_null.index[d_null], "d"] = np.nan df_null_float = test_df_float.copy() - df_null_float.loc[df_null_float.index[a_null], 'a'] = np.nan - df_null_float.loc[df_null_float.index[b_null], 'b'] = np.nan - df_null_float.loc[df_null_float.index[c_null], 'c'] = np.nan - df_null_float.loc[df_null_float.index[d_null], 'd'] = np.nan + df_null_float.loc[df_null_float.index[a_null], "a"] = np.nan + df_null_float.loc[df_null_float.index[b_null], "b"] = np.nan + df_null_float.loc[df_null_float.index[c_null], "c"] = np.nan + df_null_float.loc[df_null_float.index[d_null], "d"] = np.nan - for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: + for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]: _test_productkde_slogl_null_iter(variables, df, df_null) _test_productkde_slogl_null_iter(variables, df_float, df_null_float) - - cpd = pbn.ProductKDE(['d', 'a', 'b', 'c']) + cpd = pbn.ProductKDE(["d", "a", "b", "c"]) cpd.fit(df) - cpd2 = pbn.ProductKDE(['a', 'c', 'd', 'b']) + cpd2 = pbn.ProductKDE(["a", "c", "d", "b"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.slogl(df_null), cpd2.slogl(df_null))), "Order of evidence changes slogl() result." + assert np.all( + np.isclose(cpd.slogl(df_null), cpd2.slogl(df_null)) + ), "Order of evidence changes slogl() result." 
- cpd = pbn.ProductKDE(['d', 'a', 'b', 'c']) + cpd = pbn.ProductKDE(["d", "a", "b", "c"]) cpd.fit(df_float) - cpd2 = pbn.ProductKDE(['a', 'c', 'd', 'b']) + cpd2 = pbn.ProductKDE(["a", "c", "d", "b"]) cpd2.fit(df_float) - assert np.all(np.isclose(cpd.slogl(df_null_float), cpd2.slogl(df_null_float), atol=0.0005)), "Order of evidence changes slogl() result." + assert np.all( + np.isclose(cpd.slogl(df_null_float), cpd2.slogl(df_null_float), atol=0.0005) + ), "Order of evidence changes slogl() result." diff --git a/tests/factors/discrete/DiscreteFactor_test.py b/tests/factors/discrete/DiscreteFactor_test.py index 134575ee..0346d008 100644 --- a/tests/factors/discrete/DiscreteFactor_test.py +++ b/tests/factors/discrete/DiscreteFactor_test.py @@ -7,31 +7,45 @@ df = util_test.generate_discrete_data_dependent(10000) + def test_data_type(): a = pbn.DiscreteFactor("A", []) with pytest.raises(ValueError) as ex: a.data_type() "DiscreteFactor factor not fitted." in str(ex.value) - + categories = np.asarray(["a1", "a2"]) - a_values = pd.Categorical(categories[np.random.randint(len(categories), size=100)], categories=categories, ordered=False) - df = pd.DataFrame({'A': a_values}) + a_values = pd.Categorical( + categories[np.random.randint(len(categories), size=100)], + categories=categories, + ordered=False, + ) + df = pd.DataFrame({"A": a_values}) a.fit(df) assert a.data_type() == pa.dictionary(pa.int8(), pa.string()) categories = np.asarray(["a" + str(i) for i in range(1, 129)]) - a_values = pd.Categorical(categories[np.random.randint(len(categories), size=100)], categories=categories, ordered=False) - df = pd.DataFrame({'A': a_values}) + a_values = pd.Categorical( + categories[np.random.randint(len(categories), size=100)], + categories=categories, + ordered=False, + ) + df = pd.DataFrame({"A": a_values}) a.fit(df) assert a.data_type() == pa.dictionary(pa.int8(), pa.string()) categories = np.asarray(["a" + str(i) for i in range(1, 130)]) - a_values = pd.Categorical(categories[np.random.randint(len(categories), size=100)], categories=categories, ordered=False) - df = pd.DataFrame({'A': a_values}) + a_values = pd.Categorical( + categories[np.random.randint(len(categories), size=100)], + categories=categories, + ordered=False, + ) + df = pd.DataFrame({"A": a_values}) a.fit(df) assert a.data_type() == pa.dictionary(pa.int16(), pa.string()) + def test_fit(): # a = DiscreteFactor('C', ['A', 'B']) - a = pbn.DiscreteFactor('C', []) + a = pbn.DiscreteFactor("C", []) a.fit(df) diff --git a/tests/factors/factor_type_test.py b/tests/factors/factor_type_test.py index 0a551cb1..47e7be73 100644 --- a/tests/factors/factor_type_test.py +++ b/tests/factors/factor_type_test.py @@ -2,6 +2,7 @@ import pybnesian as pbn from pybnesian import FactorType, Factor + def test_factor_type(): lg1 = pbn.LinearGaussianCPD("a", []) lg2 = pbn.LinearGaussianCPD("b", ["a"]) @@ -34,6 +35,7 @@ def test_factor_type(): assert lg1.type() != d1.type() assert c1.type() != d1.type() + def test_new_factor_type(): class A(FactorType): def __init__(self): @@ -61,6 +63,7 @@ def __init__(self): assert a1 != b1 + def test_factor_defined_factor_type(): class F_type(FactorType): def __init__(self): @@ -89,12 +92,14 @@ def type(self): dummy_network = pbn.GaussianNetwork(["a", "b", "c", "d"]) with pytest.raises(RuntimeError) as ex: f4 = f1.type().new_factor(dummy_network, "d", ["a", "b", "c"]) - assert 'Tried to call pure virtual function "FactorType::new_factor"' in str(ex.value) + assert 'Tried to call pure virtual function "FactorType::new_factor"' in 
str( + ex.value + ) class G_type(FactorType): def __init__(self): FactorType.__init__(self) - + def new_factor(self, model, variable, evidence): return G(variable, evidence) @@ -124,4 +129,4 @@ def type(self): assert g1.type() == g4.type() assert g4.variable() == "d" - assert g4.evidence() == ["a", "b", "c"] \ No newline at end of file + assert g4.evidence() == ["a", "b", "c"] diff --git a/tests/helpers/util_test.py b/tests/helpers/util_test.py index 4f523fdf..f368bee5 100644 --- a/tests/helpers/util_test.py +++ b/tests/helpers/util_test.py @@ -6,17 +6,19 @@ def generate_normal_data(size, seed=0): np.random.seed(seed) a_array = np.random.normal(3, 0.5, size=size) - b_array = 2.5 + 1.65*a_array + np.random.normal(0, 2, size=size) - c_array = -4.2 - 1.2*a_array + 3.2*b_array + np.random.normal(0, 0.75, size=size) - d_array = 1.5 - 0.9*a_array + 5.6*b_array + 0.3 * c_array + np.random.normal(0, 0.5, size=size) + b_array = 2.5 + 1.65 * a_array + np.random.normal(0, 2, size=size) + c_array = ( + -4.2 - 1.2 * a_array + 3.2 * b_array + np.random.normal(0, 0.75, size=size) + ) + d_array = ( + 1.5 + - 0.9 * a_array + + 5.6 * b_array + + 0.3 * c_array + + np.random.normal(0, 0.5, size=size) + ) - - return pd.DataFrame({ - 'a': a_array, - 'b': b_array, - 'c': c_array, - 'd': d_array - }) + return pd.DataFrame({"a": a_array, "b": b_array, "c": c_array, "d": d_array}) def generate_normal_data_indep(size, seed=0): @@ -24,76 +26,94 @@ def generate_normal_data_indep(size, seed=0): a_array = np.random.normal(3, 0.5, size=size) b_array = np.random.normal(2.5, 2, size=size) - c_array = -4.2 - 1.2*a_array + 3.2*b_array + np.random.normal(0, 0.75, size=size) + c_array = ( + -4.2 - 1.2 * a_array + 3.2 * b_array + np.random.normal(0, 0.75, size=size) + ) d_array = 1.5 - 0.3 * c_array + np.random.normal(0, 0.5, size=size) - - return pd.DataFrame({ - 'a': a_array, - 'b': b_array, - 'c': c_array, - 'd': d_array - }) + return pd.DataFrame({"a": a_array, "b": b_array, "c": c_array, "d": d_array}) def generate_discrete_data_uniform(size, seed=0): np.random.seed(seed) - a_dict = np.asarray(['a1', 'a2']) - b_dict = np.asarray(['b1', 'b2', 'b3']) - c_dict = np.asarray(['c1', 'c2']) - d_dict = np.asarray(['d1', 'd2', 'd3', 'd4']) + a_dict = np.asarray(["a1", "a2"]) + b_dict = np.asarray(["b1", "b2", "b3"]) + c_dict = np.asarray(["c1", "c2"]) + d_dict = np.asarray(["d1", "d2", "d3", "d4"]) - return pd.DataFrame({'A': a_dict[np.random.randint(0, a_dict.size, size=size)], - 'B': b_dict[np.random.randint(0, b_dict.size, size=size)], - 'C': c_dict[np.random.randint(0, c_dict.size, size=size)], - 'D': d_dict[np.random.randint(0, d_dict.size, size=size)] - }, dtype='category') + return pd.DataFrame( + { + "A": a_dict[np.random.randint(0, a_dict.size, size=size)], + "B": b_dict[np.random.randint(0, b_dict.size, size=size)], + "C": c_dict[np.random.randint(0, c_dict.size, size=size)], + "D": d_dict[np.random.randint(0, d_dict.size, size=size)], + }, + dtype="category", + ) def generate_discrete_data_dependent(size, seed=0): np.random.seed(seed) - a_dict = np.asarray(['a1', 'a2']) - b_dict = np.asarray(['b1', 'b2', 'b3']) - c_dict = np.asarray(['c1', 'c2']) - d_dict = np.asarray(['d1', 'd2', 'd3', 'd4']) + a_dict = np.asarray(["a1", "a2"]) + b_dict = np.asarray(["b1", "b2", "b3"]) + c_dict = np.asarray(["c1", "c2"]) + d_dict = np.asarray(["d1", "d2", "d3", "d4"]) a_values = a_dict[np.random.choice(a_dict.size, size, p=[0.75, 0.25])] b_values = np.empty_like(a_values) c_values = np.empty_like(a_values) d_values = 
np.empty_like(a_values) - a1_indices = a_values == 'a1' - - b_values[a1_indices] = b_dict[np.random.choice(b_dict.size, np.sum(a1_indices), p=[0.33, 0.33, 0.34])] - b_values[~a1_indices] = b_dict[np.random.choice(b_dict.size, np.sum(~a1_indices), p=[0, 0.8, 0.2])] - - a1b1_indices = np.logical_and(a_values == 'a1', b_values == 'b1') - a1b2_indices = np.logical_and(a_values == 'a1', b_values == 'b2') - a1b3_indices = np.logical_and(a_values == 'a1', b_values == 'b3') - a2b1_indices = np.logical_and(a_values == 'a2', b_values == 'b1') - a2b2_indices = np.logical_and(a_values == 'a2', b_values == 'b2') - a2b3_indices = np.logical_and(a_values == 'a2', b_values == 'b3') - - c_values[a1b1_indices] = c_dict[np.random.choice(c_dict.size, np.sum(a1b1_indices), p=[0.5, 0.5])] - c_values[a1b2_indices] = c_dict[np.random.choice(c_dict.size, np.sum(a1b2_indices), p=[0.75, 0.25])] - c_values[a1b3_indices] = c_dict[np.random.choice(c_dict.size, np.sum(a1b3_indices), p=[0.2, 0.8])] - c_values[a2b1_indices] = c_dict[np.random.choice(c_dict.size, np.sum(a2b1_indices), p=[1, 0])] - c_values[a2b2_indices] = c_dict[np.random.choice(c_dict.size, np.sum(a2b2_indices), p=[0, 1])] - c_values[a2b3_indices] = c_dict[np.random.choice(c_dict.size, np.sum(a2b3_indices), p=[0.01, 0.99])] - - c1_indices = c_values == 'c1' - c2_indices = c_values == 'c2' - - d_values[c1_indices] = d_dict[np.random.choice(d_dict.size, np.sum(c1_indices), p=[0.25, 0.25, 0.25, 0.25])] - d_values[c2_indices] = d_dict[np.random.choice(d_dict.size, np.sum(c2_indices), p=[0.7, 0, 0.15, 0.15])] - - return pd.DataFrame({'A': a_values, - 'B': b_values, - 'C': c_values, - 'D': d_values - }, dtype='category') + a1_indices = a_values == "a1" + + b_values[a1_indices] = b_dict[ + np.random.choice(b_dict.size, np.sum(a1_indices), p=[0.33, 0.33, 0.34]) + ] + b_values[~a1_indices] = b_dict[ + np.random.choice(b_dict.size, np.sum(~a1_indices), p=[0, 0.8, 0.2]) + ] + + a1b1_indices = np.logical_and(a_values == "a1", b_values == "b1") + a1b2_indices = np.logical_and(a_values == "a1", b_values == "b2") + a1b3_indices = np.logical_and(a_values == "a1", b_values == "b3") + a2b1_indices = np.logical_and(a_values == "a2", b_values == "b1") + a2b2_indices = np.logical_and(a_values == "a2", b_values == "b2") + a2b3_indices = np.logical_and(a_values == "a2", b_values == "b3") + + c_values[a1b1_indices] = c_dict[ + np.random.choice(c_dict.size, np.sum(a1b1_indices), p=[0.5, 0.5]) + ] + c_values[a1b2_indices] = c_dict[ + np.random.choice(c_dict.size, np.sum(a1b2_indices), p=[0.75, 0.25]) + ] + c_values[a1b3_indices] = c_dict[ + np.random.choice(c_dict.size, np.sum(a1b3_indices), p=[0.2, 0.8]) + ] + c_values[a2b1_indices] = c_dict[ + np.random.choice(c_dict.size, np.sum(a2b1_indices), p=[1, 0]) + ] + c_values[a2b2_indices] = c_dict[ + np.random.choice(c_dict.size, np.sum(a2b2_indices), p=[0, 1]) + ] + c_values[a2b3_indices] = c_dict[ + np.random.choice(c_dict.size, np.sum(a2b3_indices), p=[0.01, 0.99]) + ] + + c1_indices = c_values == "c1" + c2_indices = c_values == "c2" + + d_values[c1_indices] = d_dict[ + np.random.choice(d_dict.size, np.sum(c1_indices), p=[0.25, 0.25, 0.25, 0.25]) + ] + d_values[c2_indices] = d_dict[ + np.random.choice(d_dict.size, np.sum(c2_indices), p=[0.7, 0, 0.15, 0.15]) + ] + + return pd.DataFrame( + {"A": a_values, "B": b_values, "C": c_values, "D": d_values}, dtype="category" + ) def generate_hybrid_data(size, seed=0): @@ -107,52 +127,78 @@ def generate_hybrid_data(size, seed=0): # D np.random.seed(seed) - a_dict = np.asarray(['a1', 'a2']) + 
a_dict = np.asarray(["a1", "a2"]) a_values = a_dict[np.random.choice(a_dict.size, size, p=[0.75, 0.25])] - b_dict = np.asarray(['b1', 'b2', 'b3']) + b_dict = np.asarray(["b1", "b2", "b3"]) b_values = b_dict[np.random.choice(b_dict.size, size, p=[0.3, 0.4, 0.3])] c_values = -4.2 + np.random.normal(0, 0.75, size=size) - a1b1_indices = np.logical_and(a_values == 'a1', b_values == 'b1') - a1b2_indices = np.logical_and(a_values == 'a1', b_values == 'b2') - a1b3_indices = np.logical_and(a_values == 'a1', b_values == 'b3') - a2b1_indices = np.logical_and(a_values == 'a2', b_values == 'b1') - a2b2_indices = np.logical_and(a_values == 'a2', b_values == 'b2') - a2b3_indices = np.logical_and(a_values == 'a2', b_values == 'b3') + a1b1_indices = np.logical_and(a_values == "a1", b_values == "b1") + a1b2_indices = np.logical_and(a_values == "a1", b_values == "b2") + a1b3_indices = np.logical_and(a_values == "a1", b_values == "b3") + a2b1_indices = np.logical_and(a_values == "a2", b_values == "b1") + a2b2_indices = np.logical_and(a_values == "a2", b_values == "b2") + a2b3_indices = np.logical_and(a_values == "a2", b_values == "b3") d_values = np.empty_like(c_values) d_values[a1b1_indices] = np.random.normal(1, 0.75, size=a1b1_indices.sum()) - d_values[a1b2_indices] = -2 + c_values[a1b2_indices] + np.random.normal(0, 2, size=a1b2_indices.sum()) - d_values[a1b3_indices] = -1 + 3*c_values[a1b3_indices] + np.random.normal(0, 0.25, size=a1b3_indices.sum()) + d_values[a1b2_indices] = ( + -2 + c_values[a1b2_indices] + np.random.normal(0, 2, size=a1b2_indices.sum()) + ) + d_values[a1b3_indices] = ( + -1 + + 3 * c_values[a1b3_indices] + + np.random.normal(0, 0.25, size=a1b3_indices.sum()) + ) d_values[a2b1_indices] = np.random.normal(2, 1, size=a2b1_indices.sum()) - d_values[a2b2_indices] = 3.5 + -1.2*c_values[a2b2_indices] + np.random.normal(0, 1, size=a2b2_indices.sum()) - d_values[a2b3_indices] = 4.8 + -2*c_values[a2b3_indices] + np.random.normal(0, 1.5, size=a2b3_indices.sum()) + d_values[a2b2_indices] = ( + 3.5 + + -1.2 * c_values[a2b2_indices] + + np.random.normal(0, 1, size=a2b2_indices.sum()) + ) + d_values[a2b3_indices] = ( + 4.8 + + -2 * c_values[a2b3_indices] + + np.random.normal(0, 1.5, size=a2b3_indices.sum()) + ) + + return pd.DataFrame( + { + "A": pd.Series(a_values, dtype="category"), + "B": pd.Series(b_values, dtype="category"), + "C": c_values, + "D": d_values, + } + ) - return pd.DataFrame({'A': pd.Series(a_values, dtype='category'), - 'B': pd.Series(b_values, dtype='category'), - 'C': c_values, - 'D': d_values - }) def generate_indep_hybrid_data(size, seed=0): np.random.seed(seed) - d2_dict = np.asarray(['a1', 'a2']) + d2_dict = np.asarray(["a1", "a2"]) d2_values = d2_dict[np.random.choice(d2_dict.size, size, p=[0.5, 0.5])] - d3_dict = np.asarray(['b1', 'b2', 'b3']) + d3_dict = np.asarray(["b1", "b2", "b3"]) d3_values = d3_dict[np.random.choice(d3_dict.size, size, p=[0.33, 0.34, 0.33])] - d4_dict = np.asarray(['c1', 'c2', 'c3', 'c4']) - d4_values = d4_dict[np.random.choice(d4_dict.size, size, p=[0.25, 0.25, 0.25, 0.25])] + d4_dict = np.asarray(["c1", "c2", "c3", "c4"]) + d4_values = d4_dict[ + np.random.choice(d4_dict.size, size, p=[0.25, 0.25, 0.25, 0.25]) + ] - d5_dict = np.asarray(['d1', 'd2', 'd3', 'd4', 'd5']) - d5_values = d5_dict[np.random.choice(d5_dict.size, size, p=[0.2, 0.2, 0.2, 0.2, 0.2])] + d5_dict = np.asarray(["d1", "d2", "d3", "d4", "d5"]) + d5_values = d5_dict[ + np.random.choice(d5_dict.size, size, p=[0.2, 0.2, 0.2, 0.2, 0.2]) + ] - d6_dict = np.asarray(['e1', 'e2', 'e3', 
'e4', 'e5', 'e6']) - d6_values = d6_dict[np.random.choice(d6_dict.size, size, p=[0.166, 0.166, 0.166, 0.166, 0.166, 0.17])] + d6_dict = np.asarray(["e1", "e2", "e3", "e4", "e5", "e6"]) + d6_values = d6_dict[ + np.random.choice( + d6_dict.size, size, p=[0.166, 0.166, 0.166, 0.166, 0.166, 0.17] + ) + ] c1_values = -4.2 + np.random.normal(0, 0.75, size=size) c2_values = np.random.normal(1, 2, size=size) @@ -161,15 +207,18 @@ def generate_indep_hybrid_data(size, seed=0): c5_values = np.random.normal(-1.2, 0.5, size=size) c6_values = np.random.normal(3, 1.5, size=size) - return pd.DataFrame({'D2': pd.Series(d2_values, dtype='category'), - 'D3': pd.Series(d3_values, dtype='category'), - 'D4': pd.Series(d4_values, dtype='category'), - 'D5': pd.Series(d5_values, dtype='category'), - 'D6': pd.Series(d6_values, dtype='category'), - 'C1': c1_values, - 'C2': c2_values, - 'C3': c3_values, - 'C4': c4_values, - 'C5': c5_values, - 'C6': c6_values, - }) \ No newline at end of file + return pd.DataFrame( + { + "D2": pd.Series(d2_values, dtype="category"), + "D3": pd.Series(d3_values, dtype="category"), + "D4": pd.Series(d4_values, dtype="category"), + "D5": pd.Series(d5_values, dtype="category"), + "D6": pd.Series(d6_values, dtype="category"), + "C1": c1_values, + "C2": c2_values, + "C3": c3_values, + "C4": c4_values, + "C5": c5_values, + "C6": c6_values, + } + ) diff --git a/tests/learning/algorithms/constraint_test.py b/tests/learning/algorithms/constraint_test.py index 91fca663..4f22a497 100644 --- a/tests/learning/algorithms/constraint_test.py +++ b/tests/learning/algorithms/constraint_test.py @@ -1,38 +1,50 @@ from pybnesian import PartiallyDirectedGraph, MeekRules + def test_meek_rule1(): # From Koller Chapter 3.4, Figure 3.12, pag 89. gr1 = PartiallyDirectedGraph(["X", "Y", "Z"], [("X", "Y")], [("Y", "Z")]) assert MeekRules.rule1(gr1) assert gr1.num_edges() == 0 - assert set(gr1.arcs()) == set([('X', 'Y'), ('Y', 'Z')]) + assert set(gr1.arcs()) == set([("X", "Y"), ("Y", "Z")]) assert not MeekRules.rule1(gr1) + def test_meek_rule2(): # From Koller Chapter 3.4, Figure 3.12, pag 89. - gr2 = PartiallyDirectedGraph(["X", "Y", "Z"], [("X", "Y"), ("Y", "Z")], [("X", "Z")]) + gr2 = PartiallyDirectedGraph( + ["X", "Y", "Z"], [("X", "Y"), ("Y", "Z")], [("X", "Z")] + ) assert MeekRules.rule2(gr2) assert gr2.num_edges() == 0 - assert set(gr2.arcs()) == set([('X', 'Y'), ('Y', 'Z'), ('X', 'Z')]) + assert set(gr2.arcs()) == set([("X", "Y"), ("Y", "Z"), ("X", "Z")]) assert not MeekRules.rule2(gr2) + def test_meek_rule3(): # From Koller Chapter 3.4, Figure 3.12, pag 89. - gr3 = PartiallyDirectedGraph(["X", "Y1", "Y2", "Z"], [("Y1", "Z"), ("Y2", "Z")], [("X", "Y1"), ("X", "Y2"), ("X", "Z")]) + gr3 = PartiallyDirectedGraph( + ["X", "Y1", "Y2", "Z"], + [("Y1", "Z"), ("Y2", "Z")], + [("X", "Y1"), ("X", "Y2"), ("X", "Z")], + ) assert MeekRules.rule3(gr3) - assert set(gr3.edges()) == set([('X', 'Y1'), ('X', 'Y2')]) - assert set(gr3.arcs()) == set([('X', 'Z'), ('Y1', 'Z'), ('Y2', 'Z')]) + assert set(gr3.edges()) == set([("X", "Y1"), ("X", "Y2")]) + assert set(gr3.arcs()) == set([("X", "Z"), ("Y1", "Z"), ("Y2", "Z")]) assert not MeekRules.rule3(gr3) + def test_meek_sequential(): # From Koller Chapter 3.4, Figure 3.13, pag 90. 
- koller = PartiallyDirectedGraph(["A", "B", "C", "D", "E", "F", "G"], - [("B", "E"), ("C", "E")], - [("A", "B"), ("B", "D"), ("C", "F"), ("E", "F"), ("F", "G")]) + koller = PartiallyDirectedGraph( + ["A", "B", "C", "D", "E", "F", "G"], + [("B", "E"), ("C", "E")], + [("A", "B"), ("B", "D"), ("C", "F"), ("E", "F"), ("F", "G")], + ) changed = True while changed: changed = False @@ -40,5 +52,7 @@ def test_meek_sequential(): changed = changed or MeekRules.rule2(koller) changed = changed or MeekRules.rule3(koller) - assert set(koller.edges()) == set([('A', 'B'), ('B', 'D')]) - assert set(koller.arcs()) == set([('B', 'E'), ('C', 'E'), ('E', 'F'), ('C', 'F'), ('F', 'G')]) \ No newline at end of file + assert set(koller.edges()) == set([("A", "B"), ("B", "D")]) + assert set(koller.arcs()) == set( + [("B", "E"), ("C", "E"), ("E", "F"), ("C", "F"), ("F", "G")] + ) diff --git a/tests/learning/algorithms/hillclimbing_test.py b/tests/learning/algorithms/hillclimbing_test.py index 4b9ca490..8764d5bb 100644 --- a/tests/learning/algorithms/hillclimbing_test.py +++ b/tests/learning/algorithms/hillclimbing_test.py @@ -5,17 +5,18 @@ df = util_test.generate_normal_data(1000) + def test_hc_estimate(): bic = pbn.BIC(df) column_names = list(df.columns.values) start = pbn.GaussianNetwork(column_names) # Check algorithm with BN with nodes removed. - column_names.insert(1, 'e') - column_names.insert(3, 'f') + column_names.insert(1, "e") + column_names.insert(3, "f") start_removed_nodes = pbn.GaussianNetwork(column_names) - start_removed_nodes.remove_node('e') - start_removed_nodes.remove_node('f') + start_removed_nodes.remove_node("e") + start_removed_nodes.remove_node("f") arc_set = pbn.ArcOperatorSet() @@ -38,39 +39,56 @@ def test_hc_estimate(): reversed_arc = res.arcs()[0][::-1] assert added_arc == reversed_arc - res_removed = hc.estimate(arc_set, bic, start_removed_nodes, max_iters=1, arc_blacklist=[added_arc_removed]) + res_removed = hc.estimate( + arc_set, + bic, + start_removed_nodes, + max_iters=1, + arc_blacklist=[added_arc_removed], + ) assert res.num_arcs() == 1 reversed_arc_removed = res_removed.arcs()[0][::-1] assert added_arc_removed == reversed_arc_removed - assert np.isclose(op_delta, bic.local_score(res, added_arc[1], [added_arc[0]]) - bic.local_score(res, added_arc[1], [])) - assert np.isclose(op_delta, bic.local_score(res, added_arc_removed[1], [added_arc_removed[0]]) - bic.local_score(res, added_arc_removed[1], [])) + assert np.isclose( + op_delta, + bic.local_score(res, added_arc[1], [added_arc[0]]) + - bic.local_score(res, added_arc[1], []), + ) + assert np.isclose( + op_delta, + bic.local_score(res, added_arc_removed[1], [added_arc_removed[0]]) + - bic.local_score(res, added_arc_removed[1], []), + ) res = hc.estimate(arc_set, bic, start, epsilon=(op_delta + 0.01)) assert res.num_arcs() == start.num_arcs() - res_removed = hc.estimate(arc_set, bic, start_removed_nodes, epsilon=(op_delta + 0.01)) + res_removed = hc.estimate( + arc_set, bic, start_removed_nodes, epsilon=(op_delta + 0.01) + ) assert res_removed.num_arcs() == start_removed_nodes.num_arcs() - # Can't compare models because the arcs could be oriented in different direction, + # Can't compare models because the arcs could be oriented in different direction, # leading to a different search path. Execute the code, just to check no error is given. 
res = hc.estimate(arc_set, bic, start, verbose=False) res_removed = hc.estimate(arc_set, bic, start_removed_nodes, verbose=False) + def test_hc_conditional_estimate(): bic = pbn.BIC(df) column_names = list(df.columns.values) start = pbn.ConditionalGaussianNetwork(column_names[2:], column_names[:2]) - + nodes = column_names[2:] - nodes.insert(1, 'e') + nodes.insert(1, "e") interface_nodes = column_names[:2] - interface_nodes.insert(1, 'f') + interface_nodes.insert(1, "f") start_removed_nodes = pbn.ConditionalGaussianNetwork(nodes, interface_nodes) - start_removed_nodes.remove_node('e') - start_removed_nodes.remove_interface_node('f') - + start_removed_nodes.remove_node("e") + start_removed_nodes.remove_interface_node("f") + arc_set = pbn.ArcOperatorSet() hc = pbn.GreedyHillClimbing() @@ -79,37 +97,50 @@ def test_hc_conditional_estimate(): added_arc = res.arcs()[0] op_delta = bic.score(res) - bic.score(start) - res_removed = hc.estimate(arc_set, bic, start_removed_nodes, max_iters=1, verbose=False) + res_removed = hc.estimate( + arc_set, bic, start_removed_nodes, max_iters=1, verbose=False + ) assert res_removed.num_arcs() == 1 added_arc_removed = res_removed.arcs()[0] assert added_arc == added_arc_removed or added_arc == added_arc_removed[::-1] assert np.isclose(op_delta, bic.score(res_removed) - bic.score(start_removed_nodes)) - assert np.isclose(op_delta, bic.local_score(res, added_arc[1], [added_arc[0]]) - - bic.local_score(res, added_arc[1], [])) - assert np.isclose(op_delta, bic.local_score(res, added_arc_removed[1], [added_arc_removed[0]]) - - bic.local_score(res, added_arc_removed[1], [])) + assert np.isclose( + op_delta, + bic.local_score(res, added_arc[1], [added_arc[0]]) + - bic.local_score(res, added_arc[1], []), + ) + assert np.isclose( + op_delta, + bic.local_score(res, added_arc_removed[1], [added_arc_removed[0]]) + - bic.local_score(res, added_arc_removed[1], []), + ) res = hc.estimate(arc_set, bic, start, epsilon=(op_delta + 0.01)) assert res.num_arcs() == start.num_arcs() - res_removed = hc.estimate(arc_set, bic, start_removed_nodes, epsilon=(op_delta + 0.01)) + res_removed = hc.estimate( + arc_set, bic, start_removed_nodes, epsilon=(op_delta + 0.01) + ) assert res_removed.num_arcs() == start_removed_nodes.num_arcs() res = hc.estimate(arc_set, bic, start, verbose=False) - assert all(map(lambda arc : not res.is_interface(arc[1]), res.arcs())) + assert all(map(lambda arc: not res.is_interface(arc[1]), res.arcs())) res_removed = hc.estimate(arc_set, bic, start_removed_nodes, verbose=False) - assert all(map(lambda arc : not res_removed.is_interface(arc[1]), res_removed.arcs())) + assert all( + map(lambda arc: not res_removed.is_interface(arc[1]), res_removed.arcs()) + ) + def test_hc_estimate_validation(): column_names = list(df.columns.values) start = pbn.GaussianNetwork(column_names) - column_names.insert(1, 'e') - column_names.insert(4, 'f') + column_names.insert(1, "e") + column_names.insert(4, "f") start_removed_nodes = pbn.GaussianNetwork(column_names) - start_removed_nodes.remove_node('e') - start_removed_nodes.remove_node('f') - + start_removed_nodes.remove_node("e") + start_removed_nodes.remove_node("f") + vl = pbn.ValidatedLikelihood(df) arc_set = pbn.ArcOperatorSet() @@ -124,12 +155,20 @@ def test_hc_estimate_validation(): assert res_removed.num_arcs() == 1 added_arc_removed = res_removed.arcs()[0] assert added_arc == added_arc_removed or added_arc == added_arc_removed[::-1] - assert np.isclose(op_delta, vl.cv_lik.score(res_removed) - 
vl.cv_lik.score(start_removed_nodes)) - - assert np.isclose(op_delta, vl.cv_lik.local_score(res, added_arc[1], [added_arc[0]]) - - vl.cv_lik.local_score(res, added_arc[1], [])) - assert np.isclose(op_delta, vl.cv_lik.local_score(res, added_arc_removed[1], [added_arc_removed[0]]) - - vl.cv_lik.local_score(res, added_arc_removed[1], [])) + assert np.isclose( + op_delta, vl.cv_lik.score(res_removed) - vl.cv_lik.score(start_removed_nodes) + ) + + assert np.isclose( + op_delta, + vl.cv_lik.local_score(res, added_arc[1], [added_arc[0]]) + - vl.cv_lik.local_score(res, added_arc[1], []), + ) + assert np.isclose( + op_delta, + vl.cv_lik.local_score(res, added_arc_removed[1], [added_arc_removed[0]]) + - vl.cv_lik.local_score(res, added_arc_removed[1], []), + ) # CV is score equivalent for GBNs, so if we blacklist the added_edge, its reverse will be added. res = hc.estimate(arc_set, vl, start, max_iters=1, arc_blacklist=[added_arc]) @@ -137,29 +176,37 @@ def test_hc_estimate_validation(): reversed_arc = res.arcs()[0][::-1] assert added_arc == reversed_arc - res_removed = hc.estimate(arc_set, vl, start_removed_nodes, max_iters=1, arc_blacklist=[added_arc_removed]) + res_removed = hc.estimate( + arc_set, vl, start_removed_nodes, max_iters=1, arc_blacklist=[added_arc_removed] + ) assert res_removed.num_arcs() == 1 reversed_arc_removed = res_removed.arcs()[0][::-1] assert reversed_arc == reversed_arc_removed - + res = hc.estimate(arc_set, vl, start, epsilon=(op_delta + 0.01)) assert res.num_arcs() == start.num_arcs() - res_removed = hc.estimate(arc_set, vl, start_removed_nodes, epsilon=(op_delta + 0.01)) + res_removed = hc.estimate( + arc_set, vl, start_removed_nodes, epsilon=(op_delta + 0.01) + ) assert res_removed.num_arcs() == start_removed_nodes.num_arcs() - # Can't compare models because the arcs could be oriented in different direction, + # Can't compare models because the arcs could be oriented in different direction, # leading to a different search path. Execute the code, just to check no error is given. 
res = hc.estimate(arc_set, vl, start, verbose=False) res_removed = hc.estimate(arc_set, vl, start_removed_nodes, verbose=False) + def test_hc_shortcut_function(): model = pbn.hc(df, bn_type=pbn.GaussianNetworkType()) assert type(model) == pbn.GaussianNetwork - model = pbn.hc(df, bn_type=MyRestrictedGaussianNetworkType(), score="bic", operators=["arcs"]) + model = pbn.hc( + df, bn_type=MyRestrictedGaussianNetworkType(), score="bic", operators=["arcs"] + ) assert type(model) == NewBN + class MyRestrictedGaussianNetworkType(BayesianNetworkType): def __init__(self): BayesianNetworkType.__init__(self) @@ -179,12 +226,15 @@ def new_bn(self, nodes): def __str__(self): return "MyRestrictedGaussianNetworkType" + class NewBN(BayesianNetwork): def __init__(self, variables, arcs=None): if arcs is None: BayesianNetwork.__init__(self, MyRestrictedGaussianNetworkType(), variables) else: - BayesianNetwork.__init__(self, MyRestrictedGaussianNetworkType(), variables, arcs) + BayesianNetwork.__init__( + self, MyRestrictedGaussianNetworkType(), variables, arcs + ) self.extra_data = "extra" @@ -194,6 +244,7 @@ def __getstate_extra__(self): def __setstate_extra__(self, extra): self.extra_data = extra + def test_newbn_estimate_validation(): start = NewBN(["a", "b", "c", "d"]) hc = pbn.GreedyHillClimbing() @@ -203,4 +254,4 @@ def test_newbn_estimate_validation(): estimated = hc.estimate(arc, bic, start) assert type(start) == type(estimated) - assert estimated.extra_data == "extra" \ No newline at end of file + assert estimated.extra_data == "extra" diff --git a/tests/learning/operators/operatorpool_test.py b/tests/learning/operators/operatorpool_test.py index 60f05f71..e167e1d8 100644 --- a/tests/learning/operators/operatorpool_test.py +++ b/tests/learning/operators/operatorpool_test.py @@ -5,6 +5,7 @@ SIZE = 10000 df = util_test.generate_normal_data(SIZE) + def test_create(): arcs = pbn.ArcOperatorSet() node_type = pbn.ChangeNodeTypeSet() @@ -14,16 +15,17 @@ def test_create(): pool = pbn.OperatorPool([]) assert "cannot be empty" in str(ex.value) + def test_find_max(): - spbn = pbn.SemiparametricBN(['a', 'b', 'c', 'd']) + spbn = pbn.SemiparametricBN(["a", "b", "c", "d"]) cv = pbn.CVLikelihood(df) arcs = pbn.ArcOperatorSet() node_type = pbn.ChangeNodeTypeSet() - + arcs.cache_scores(spbn, cv) spbn.set_unknown_node_types(df) node_type.cache_scores(spbn, cv) - + arcs_max = arcs.find_max(spbn) node_max = node_type.find_max(spbn) @@ -36,5 +38,3 @@ def test_find_max(): assert op_combined == arcs_max else: assert op_combined == node_max - - \ No newline at end of file diff --git a/tests/learning/operators/operators_test.py b/tests/learning/operators/operators_test.py index f0c7cf5e..b0f7a070 100644 --- a/tests/learning/operators/operators_test.py +++ b/tests/learning/operators/operators_test.py @@ -1,105 +1,108 @@ import pytest import pybnesian as pbn + def test_create(): o = pbn.AddArc("a", "b", 1) - assert o.source() == 'a' - assert o.target() == 'b' + assert o.source() == "a" + assert o.target() == "b" assert o.delta() == 1 o = pbn.RemoveArc("a", "b", 2) - assert o.source() == 'a' - assert o.target() == 'b' + assert o.source() == "a" + assert o.target() == "b" assert o.delta() == 2 o = pbn.FlipArc("a", "b", 3) - assert o.source() == 'a' - assert o.target() == 'b' + assert o.source() == "a" + assert o.target() == "b" assert o.delta() == 3 o = pbn.ChangeNodeType("a", pbn.CKDEType(), 4) - assert o.node() == 'a' + assert o.node() == "a" assert o.node_type() == pbn.CKDEType() assert o.delta() == 4 + def test_apply(): - gbn 
= pbn.GaussianNetwork(['a', 'b', 'c', 'd']) + gbn = pbn.GaussianNetwork(["a", "b", "c", "d"]) assert gbn.num_arcs() == 0 - assert not gbn.has_arc('a', 'b') + assert not gbn.has_arc("a", "b") o = pbn.AddArc("a", "b", 1) o.apply(gbn) assert gbn.num_arcs() == 1 - assert gbn.has_arc('a', 'b') - + assert gbn.has_arc("a", "b") + o = pbn.FlipArc("a", "b", 1) o.apply(gbn) assert gbn.num_arcs() == 1 - assert not gbn.has_arc('a', 'b') - assert gbn.has_arc('b', 'a') + assert not gbn.has_arc("a", "b") + assert gbn.has_arc("b", "a") o = pbn.RemoveArc("b", "a", 1) o.apply(gbn) assert gbn.num_arcs() == 0 - assert not gbn.has_arc('b', 'a') + assert not gbn.has_arc("b", "a") o = pbn.ChangeNodeType("a", pbn.CKDEType(), 1) with pytest.raises(ValueError) as ex: o.apply(gbn) assert "Wrong factor type" in str(ex.value) - spbn = pbn.SemiparametricBN(['a', 'b', 'c', 'd']) + spbn = pbn.SemiparametricBN(["a", "b", "c", "d"]) assert spbn.num_arcs() == 0 o = pbn.ChangeNodeType("a", pbn.CKDEType(), 1) - assert(spbn.node_type('a') == pbn.UnknownFactorType()) + assert spbn.node_type("a") == pbn.UnknownFactorType() o.apply(spbn) - assert(spbn.node_type('a') == pbn.CKDEType()) + assert spbn.node_type("a") == pbn.CKDEType() - assert not spbn.has_arc('a', 'b') + assert not spbn.has_arc("a", "b") o = pbn.AddArc("a", "b", 1) o.apply(spbn) assert spbn.num_arcs() == 1 - assert spbn.has_arc('a', 'b') - + assert spbn.has_arc("a", "b") + o = pbn.FlipArc("a", "b", 1) o.apply(spbn) assert spbn.num_arcs() == 1 - assert not spbn.has_arc('a', 'b') - assert spbn.has_arc('b', 'a') + assert not spbn.has_arc("a", "b") + assert spbn.has_arc("b", "a") o = pbn.RemoveArc("b", "a", 1) o.apply(spbn) assert spbn.num_arcs() == 0 - assert not spbn.has_arc('b', 'a') + assert not spbn.has_arc("b", "a") + def test_opposite(): bn = pbn.SemiparametricBN(["a", "b"]) o = pbn.AddArc("a", "b", 1) oppo = o.opposite(bn) - assert oppo.source() == 'a' - assert oppo.target() == 'b' + assert oppo.source() == "a" + assert oppo.target() == "b" assert oppo.delta() == -1 assert type(oppo) == pbn.RemoveArc o = pbn.RemoveArc("a", "b", 1) oppo = o.opposite(bn) - assert oppo.source() == 'a' - assert oppo.target() == 'b' + assert oppo.source() == "a" + assert oppo.target() == "b" assert oppo.delta() == -1 assert type(oppo) == pbn.AddArc o = pbn.FlipArc("a", "b", 1) oppo = o.opposite(bn) - assert oppo.source() == 'b' - assert oppo.target() == 'a' + assert oppo.source() == "b" + assert oppo.target() == "a" assert oppo.delta() == -1 assert type(oppo) == pbn.FlipArc bn.set_node_type("a", pbn.LinearGaussianCPDType()) o = pbn.ChangeNodeType("a", pbn.CKDEType(), 1) oppo = o.opposite(bn) - assert oppo.node() == 'a' + assert oppo.node() == "a" assert oppo.node_type() == pbn.LinearGaussianCPDType() assert oppo.delta() == -1 - assert type(oppo) == pbn.ChangeNodeType \ No newline at end of file + assert type(oppo) == pbn.ChangeNodeType diff --git a/tests/learning/operators/operatorset_test.py b/tests/learning/operators/operatorset_test.py index a29a3af2..a1ef7d6c 100644 --- a/tests/learning/operators/operatorset_test.py +++ b/tests/learning/operators/operatorset_test.py @@ -6,9 +6,10 @@ SIZE = 10000 df = util_test.generate_normal_data(SIZE) + def test_create_change_node(): - gbn = pbn.GaussianNetwork(['a', 'b', 'c', 'd']) - + gbn = pbn.GaussianNetwork(["a", "b", "c", "d"]) + cv = pbn.CVLikelihood(df) node_op = pbn.ChangeNodeTypeSet() @@ -17,8 +18,9 @@ def test_create_change_node(): node_op.cache_scores(gbn, cv) assert "can only be used with non-homogeneous" in str(ex.value) + def 
test_lists(): - gbn = pbn.GaussianNetwork(['a', 'b', 'c', 'd']) + gbn = pbn.GaussianNetwork(["a", "b", "c", "d"]) bic = pbn.BIC(df) arc_op = pbn.ArcOperatorSet() @@ -43,7 +45,7 @@ def test_lists(): def test_check_max_score(): - gbn = pbn.GaussianNetwork(['c', 'd']) + gbn = pbn.GaussianNetwork(["c", "d"]) bic = pbn.BIC(df) arc_op = pbn.ArcOperatorSet() @@ -51,20 +53,23 @@ def test_check_max_score(): arc_op.cache_scores(gbn, bic) op = arc_op.find_max(gbn) - assert np.isclose(op.delta(), (bic.local_score(gbn, 'd', ['c']) - bic.local_score(gbn, 'd'))) + assert np.isclose( + op.delta(), (bic.local_score(gbn, "d", ["c"]) - bic.local_score(gbn, "d")) + ) # BIC is decomposable so the best operation is the arc in reverse direction. arc_op.set_arc_blacklist([(op.source(), op.target())]) arc_op.cache_scores(gbn, bic) - + op2 = arc_op.find_max(gbn) assert op.source() == op2.target() assert op.target() == op2.source() assert (type(op) == type(op2)) and (type(op) == pbn.AddArc) + def test_nomax(): - gbn = pbn.GaussianNetwork(['a', 'b']) + gbn = pbn.GaussianNetwork(["a", "b"]) bic = pbn.BIC(df) arc_op = pbn.ArcOperatorSet(whitelist=[("a", "b")]) @@ -73,6 +78,3 @@ def test_nomax(): op = arc_op.find_max(gbn) assert op is None - - - diff --git a/tests/learning/operators/operatorstabuset_test.py b/tests/learning/operators/operatorstabuset_test.py index eae3ade1..be7bdfa0 100644 --- a/tests/learning/operators/operatorstabuset_test.py +++ b/tests/learning/operators/operatorstabuset_test.py @@ -1,14 +1,15 @@ import pybnesian as pbn + def test_OperatorTabuSet(): tabu_set = pbn.OperatorTabuSet() assert tabu_set.empty() - assert not tabu_set.contains(pbn.AddArc("a", "b", 1)) + assert not tabu_set.contains(pbn.AddArc("a", "b", 1)) tabu_set.insert(pbn.AddArc("a", "b", 2)) assert not tabu_set.empty() - assert tabu_set.contains(pbn.AddArc("a", "b", 3)) + assert tabu_set.contains(pbn.AddArc("a", "b", 3)) assert not tabu_set.contains(pbn.RemoveArc("b", "c", 4)) tabu_set.insert(pbn.RemoveArc("b", "c", 5)) @@ -20,5 +21,3 @@ def test_OperatorTabuSet(): tabu_set.clear() assert tabu_set.empty() - - diff --git a/tests/learning/parameters/mle_test.py b/tests/learning/parameters/mle_test.py index 37699de6..3cbe6b84 100644 --- a/tests/learning/parameters/mle_test.py +++ b/tests/learning/parameters/mle_test.py @@ -20,11 +20,14 @@ def numpy_fit_mle_lg(data, variable, evidence): N = variable_data.shape[0] d = evidence_data.shape[1] linregress_data = np.column_stack((np.ones(N), evidence_data.to_numpy())) - (beta, res, _, _) = np.linalg.lstsq(linregress_data, variable_data.to_numpy(), rcond=None) + (beta, res, _, _) = np.linalg.lstsq( + linregress_data, variable_data.to_numpy(), rcond=None + ) var = res / (N - d - 1) return beta, var + def test_mle_create(): with pytest.raises(ValueError) as ex: mle = pbn.MLE(pbn.CKDEType()) @@ -32,6 +35,7 @@ def test_mle_create(): mle = pbn.MLE(pbn.LinearGaussianCPDType()) + def test_mle_lg(): mle = pbn.MLE(pbn.LinearGaussianCPDType()) @@ -53,4 +57,4 @@ def test_mle_lg(): p = mle.estimate(df, "d", ["a", "b", "c"]) np_beta, np_var = numpy_fit_mle_lg(df, "d", ["a", "b", "c"]) assert np.all(np.isclose(p.beta, np_beta)) - assert np.isclose(p.variance, np_var) \ No newline at end of file + assert np.isclose(p.variance, np_var) diff --git a/tests/learning/scores/bic_test.py b/tests/learning/scores/bic_test.py index 77bd4060..0030ffea 100644 --- a/tests/learning/scores/bic_test.py +++ b/tests/learning/scores/bic_test.py @@ -7,6 +7,7 @@ df = util_test.generate_normal_data(SIZE) + def 
numpy_local_score(data, variable, evidence): if isinstance(variable, str): node_data = data.loc[:, [variable] + evidence].dropna() @@ -20,32 +21,52 @@ def numpy_local_score(data, variable, evidence): N = variable_data.shape[0] d = evidence_data.shape[1] linregress_data = np.column_stack((np.ones(N), evidence_data.to_numpy())) - (beta, res, _, _) = np.linalg.lstsq(linregress_data, variable_data.to_numpy(), rcond=None) + (beta, res, _, _) = np.linalg.lstsq( + linregress_data, variable_data.to_numpy(), rcond=None + ) var = res / (N - d - 1) - means = beta[0] + np.sum(beta[1:]*evidence_data, axis=1) + means = beta[0] + np.sum(beta[1:] * evidence_data, axis=1) loglik = norm.logpdf(variable_data, means, np.sqrt(var)) return loglik.sum() - np.log(N) * 0.5 * (d + 2) + def test_bic_local_score(): - gbn = pbn.GaussianNetwork(['a', 'b', 'c', 'd'], [('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) - + gbn = pbn.GaussianNetwork( + ["a", "b", "c", "d"], + [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")], + ) + bic = pbn.BIC(df) - - assert np.isclose(bic.local_score(gbn, 'a', []), numpy_local_score(df, 'a', [])) - assert np.isclose(bic.local_score(gbn, 'b', ['a']), numpy_local_score(df, 'b', ['a'])) - assert np.isclose(bic.local_score(gbn, 'c', ['a', 'b']), numpy_local_score(df, 'c', ['a', 'b'])) - assert np.isclose(bic.local_score(gbn, 'd', ['a', 'b', 'c']), numpy_local_score(df, 'd', ['a', 'b', 'c'])) - assert np.isclose(bic.local_score(gbn, 'd', ['a', 'b', 'c']), numpy_local_score(df, 'd', ['b', 'c', 'a'])) - - assert bic.local_score(gbn, 'a') == bic.local_score(gbn, 'a', gbn.parents('a')) - assert bic.local_score(gbn, 'b') == bic.local_score(gbn, 'b', gbn.parents('b')) - assert bic.local_score(gbn, 'c') == bic.local_score(gbn, 'c', gbn.parents('c')) - assert bic.local_score(gbn, 'd') == bic.local_score(gbn, 'd', gbn.parents('d')) + + assert np.isclose(bic.local_score(gbn, "a", []), numpy_local_score(df, "a", [])) + assert np.isclose( + bic.local_score(gbn, "b", ["a"]), numpy_local_score(df, "b", ["a"]) + ) + assert np.isclose( + bic.local_score(gbn, "c", ["a", "b"]), numpy_local_score(df, "c", ["a", "b"]) + ) + assert np.isclose( + bic.local_score(gbn, "d", ["a", "b", "c"]), + numpy_local_score(df, "d", ["a", "b", "c"]), + ) + assert np.isclose( + bic.local_score(gbn, "d", ["a", "b", "c"]), + numpy_local_score(df, "d", ["b", "c", "a"]), + ) + + assert bic.local_score(gbn, "a") == bic.local_score(gbn, "a", gbn.parents("a")) + assert bic.local_score(gbn, "b") == bic.local_score(gbn, "b", gbn.parents("b")) + assert bic.local_score(gbn, "c") == bic.local_score(gbn, "c", gbn.parents("c")) + assert bic.local_score(gbn, "d") == bic.local_score(gbn, "d", gbn.parents("d")) + def test_bic_local_score_null(): - gbn = pbn.GaussianNetwork(['a', 'b', 'c', 'd'], [('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) + gbn = pbn.GaussianNetwork( + ["a", "b", "c", "d"], + [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")], + ) np.random.seed(0) a_null = np.random.randint(0, SIZE, size=100) @@ -54,32 +75,51 @@ def test_bic_local_score_null(): d_null = np.random.randint(0, SIZE, size=100) df_null = df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan - + df_null.loc[df_null.index[a_null], "a"] = np.nan + df_null.loc[df_null.index[b_null], "b"] = np.nan + 
df_null.loc[df_null.index[c_null], "c"] = np.nan + df_null.loc[df_null.index[d_null], "d"] = np.nan + bic = pbn.BIC(df_null) - - assert np.isclose(bic.local_score(gbn, 'a', []), numpy_local_score(df_null, 'a', [])) - assert np.isclose(bic.local_score(gbn, 'b', ['a']), numpy_local_score(df_null, 'b', ['a'])) - assert np.isclose(bic.local_score(gbn, 'c', ['a', 'b']), numpy_local_score(df_null, 'c', ['a', 'b'])) - assert np.isclose(bic.local_score(gbn, 'd', ['a', 'b', 'c']), numpy_local_score(df_null, 'd', ['a', 'b', 'c'])) - assert np.isclose(bic.local_score(gbn, 'd', ['a', 'b', 'c']), numpy_local_score(df_null, 'd', ['b', 'c', 'a'])) - - assert bic.local_score(gbn, 'a') == bic.local_score(gbn, 'a', gbn.parents('a')) - assert bic.local_score(gbn, 'b') == bic.local_score(gbn, 'b', gbn.parents('b')) - assert bic.local_score(gbn, 'c') == bic.local_score(gbn, 'c', gbn.parents('c')) - assert bic.local_score(gbn, 'd') == bic.local_score(gbn, 'd', gbn.parents('d')) + + assert np.isclose( + bic.local_score(gbn, "a", []), numpy_local_score(df_null, "a", []) + ) + assert np.isclose( + bic.local_score(gbn, "b", ["a"]), numpy_local_score(df_null, "b", ["a"]) + ) + assert np.isclose( + bic.local_score(gbn, "c", ["a", "b"]), + numpy_local_score(df_null, "c", ["a", "b"]), + ) + assert np.isclose( + bic.local_score(gbn, "d", ["a", "b", "c"]), + numpy_local_score(df_null, "d", ["a", "b", "c"]), + ) + assert np.isclose( + bic.local_score(gbn, "d", ["a", "b", "c"]), + numpy_local_score(df_null, "d", ["b", "c", "a"]), + ) + + assert bic.local_score(gbn, "a") == bic.local_score(gbn, "a", gbn.parents("a")) + assert bic.local_score(gbn, "b") == bic.local_score(gbn, "b", gbn.parents("b")) + assert bic.local_score(gbn, "c") == bic.local_score(gbn, "c", gbn.parents("c")) + assert bic.local_score(gbn, "d") == bic.local_score(gbn, "d", gbn.parents("d")) + def test_bic_score(): - gbn = pbn.GaussianNetwork([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) - - bic = pbn.BIC(df) - - assert np.isclose(bic.score(gbn), (bic.local_score(gbn, 'a', []) + - bic.local_score(gbn, 'b', ['a']) + - bic.local_score(gbn, 'c', ['a', 'b']) + - bic.local_score(gbn, 'd', ['a', 'b', 'c']))) + gbn = pbn.GaussianNetwork( + [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")] + ) + bic = pbn.BIC(df) + assert np.isclose( + bic.score(gbn), + ( + bic.local_score(gbn, "a", []) + + bic.local_score(gbn, "b", ["a"]) + + bic.local_score(gbn, "c", ["a", "b"]) + + bic.local_score(gbn, "d", ["a", "b", "c"]) + ), + ) diff --git a/tests/learning/scores/cvlikelihood_test.py b/tests/learning/scores/cvlikelihood_test.py index 95c922c5..5a012cb6 100644 --- a/tests/learning/scores/cvlikelihood_test.py +++ b/tests/learning/scores/cvlikelihood_test.py @@ -9,6 +9,7 @@ seed = 0 + def numpy_local_score(node_type, data, variable, evidence): cv = pbn.CrossValidation(data, 10, seed) loglik = 0 @@ -32,65 +33,96 @@ def numpy_local_score(node_type, data, variable, evidence): N = variable_data.shape[0] d = evidence_data.shape[1] linregress_data = np.column_stack((np.ones(N), evidence_data.to_numpy())) - (beta, res, _, _) = np.linalg.lstsq(linregress_data, variable_data.to_numpy(), rcond=None) + (beta, res, _, _) = np.linalg.lstsq( + linregress_data, variable_data.to_numpy(), rcond=None + ) var = res / (N - d - 1) - means = beta[0] + np.sum(beta[1:]*test_evidence_data, axis=1) + means = beta[0] + np.sum(beta[1:] * test_evidence_data, axis=1) loglik += norm.logpdf(test_variable_data, means, np.sqrt(var)).sum() elif node_type == 
pbn.CKDEType(): - k_joint = gaussian_kde(node_data.to_numpy().T, - bw_method=lambda s : np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor()) + k_joint = gaussian_kde( + node_data.to_numpy().T, + bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4)) + * s.scotts_factor(), + ) if evidence: - k_marg = gaussian_kde(evidence_data.to_numpy().T, bw_method=k_joint.covariance_factor()) - loglik += np.sum(k_joint.logpdf(test_node_data.to_numpy().T) - k_marg.logpdf(test_evidence_data.to_numpy().T)) + k_marg = gaussian_kde( + evidence_data.to_numpy().T, bw_method=k_joint.covariance_factor() + ) + loglik += np.sum( + k_joint.logpdf(test_node_data.to_numpy().T) + - k_marg.logpdf(test_evidence_data.to_numpy().T) + ) else: loglik += np.sum(k_joint.logpdf(test_node_data.to_numpy().T)) return loglik + def test_cvl_create(): s = pbn.CVLikelihood(df) assert len(list(s.cv)) == 10 s = pbn.CVLikelihood(df, 5) assert len(list(s.cv)) == 5 - + s = pbn.CVLikelihood(df, 10, 0) assert len(list(s.cv)) == 10 s2 = pbn.CVLikelihood(df, 10, 0) assert len(list(s2.cv)) == 10 for (train_cv, test_cv), (train_cv2, test_cv2) in zip(s.cv, s2.cv): - assert train_cv.equals(train_cv2), "Train CV DataFrames with the same seed are not equal." - assert test_cv.equals(test_cv2), "Test CV DataFrames with the same seed are not equal." + assert train_cv.equals( + train_cv2 + ), "Train CV DataFrames with the same seed are not equal." + assert test_cv.equals( + test_cv2 + ), "Test CV DataFrames with the same seed are not equal." with pytest.raises(ValueError) as ex: - s = pbn.CVLikelihood(df, SIZE+1) + s = pbn.CVLikelihood(df, SIZE + 1) assert "Cannot split" in str(ex.value) + def test_cvl_local_score_gbn(): - gbn = pbn.GaussianNetwork([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) - + gbn = pbn.GaussianNetwork( + [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")] + ) + cvl = pbn.CVLikelihood(df, 10, seed) - - assert np.isclose(cvl.local_score(gbn, 'a', []), - numpy_local_score(pbn.LinearGaussianCPDType(), df, 'a', [])) - assert np.isclose(cvl.local_score(gbn, 'b', ['a']), - numpy_local_score(pbn.LinearGaussianCPDType(), df, 'b', ['a'])) - assert np.isclose(cvl.local_score(gbn, 'c', ['a', 'b']), - numpy_local_score(pbn.LinearGaussianCPDType(), df, 'c', ['a', 'b'])) - assert np.isclose(cvl.local_score(gbn, 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.LinearGaussianCPDType(), df, 'd', ['a', 'b', 'c'])) - assert np.isclose(cvl.local_score(gbn, 'd', ['a', 'b', 'c']), - cvl.local_score(gbn, 'd', ['b', 'c', 'a'])) - - assert cvl.local_score(gbn, 'a') == cvl.local_score(gbn, 'a', gbn.parents('a')) - assert cvl.local_score(gbn, 'b') == cvl.local_score(gbn, 'b', gbn.parents('b')) - assert cvl.local_score(gbn, 'c') == cvl.local_score(gbn, 'c', gbn.parents('c')) - assert cvl.local_score(gbn, 'd') == cvl.local_score(gbn, 'd', gbn.parents('d')) + + assert np.isclose( + cvl.local_score(gbn, "a", []), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "a", []), + ) + assert np.isclose( + cvl.local_score(gbn, "b", ["a"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "b", ["a"]), + ) + assert np.isclose( + cvl.local_score(gbn, "c", ["a", "b"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "c", ["a", "b"]), + ) + assert np.isclose( + cvl.local_score(gbn, "d", ["a", "b", "c"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "d", ["a", "b", "c"]), + ) + assert np.isclose( + cvl.local_score(gbn, "d", ["a", "b", "c"]), + cvl.local_score(gbn, "d", ["b", "c", "a"]), + ) + + 
assert cvl.local_score(gbn, "a") == cvl.local_score(gbn, "a", gbn.parents("a")) + assert cvl.local_score(gbn, "b") == cvl.local_score(gbn, "b", gbn.parents("b")) + assert cvl.local_score(gbn, "c") == cvl.local_score(gbn, "c", gbn.parents("c")) + assert cvl.local_score(gbn, "d") == cvl.local_score(gbn, "d", gbn.parents("d")) + def test_cvl_local_score_gbn_null(): - gbn = pbn.GaussianNetwork([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) - + gbn = pbn.GaussianNetwork( + [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")] + ) + np.random.seed(0) a_null = np.random.randint(0, SIZE, size=100) b_null = np.random.randint(0, SIZE, size=100) @@ -98,67 +130,101 @@ def test_cvl_local_score_gbn_null(): d_null = np.random.randint(0, SIZE, size=100) df_null = df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "a"] = np.nan + df_null.loc[df_null.index[b_null], "b"] = np.nan + df_null.loc[df_null.index[c_null], "c"] = np.nan + df_null.loc[df_null.index[d_null], "d"] = np.nan cvl = pbn.CVLikelihood(df_null, 10, seed) - assert np.isclose(cvl.local_score(gbn, 'a', []), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, 'a', [])) - assert np.isclose(cvl.local_score(gbn, 'b', ['a']), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, 'b', ['a'])) - assert np.isclose(cvl.local_score(gbn, 'c', ['a', 'b']), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, 'c', ['a', 'b'])) - assert np.isclose(cvl.local_score(gbn, 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, 'd', ['a', 'b', 'c'])) - assert np.isclose(cvl.local_score(gbn, 'd', ['a', 'b', 'c']), - cvl.local_score(gbn, 'd', ['b', 'c', 'a'])) - - assert cvl.local_score(gbn, 'a') == cvl.local_score(gbn, 'a', gbn.parents('a')) - assert cvl.local_score(gbn, 'b') == cvl.local_score(gbn, 'b', gbn.parents('b')) - assert cvl.local_score(gbn, 'c') == cvl.local_score(gbn, 'c', gbn.parents('c')) - assert cvl.local_score(gbn, 'd') == cvl.local_score(gbn, 'd', gbn.parents('d')) + assert np.isclose( + cvl.local_score(gbn, "a", []), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "a", []), + ) + assert np.isclose( + cvl.local_score(gbn, "b", ["a"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "b", ["a"]), + ) + assert np.isclose( + cvl.local_score(gbn, "c", ["a", "b"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "c", ["a", "b"]), + ) + assert np.isclose( + cvl.local_score(gbn, "d", ["a", "b", "c"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "d", ["a", "b", "c"]), + ) + assert np.isclose( + cvl.local_score(gbn, "d", ["a", "b", "c"]), + cvl.local_score(gbn, "d", ["b", "c", "a"]), + ) + + assert cvl.local_score(gbn, "a") == cvl.local_score(gbn, "a", gbn.parents("a")) + assert cvl.local_score(gbn, "b") == cvl.local_score(gbn, "b", gbn.parents("b")) + assert cvl.local_score(gbn, "c") == cvl.local_score(gbn, "c", gbn.parents("c")) + assert cvl.local_score(gbn, "d") == cvl.local_score(gbn, "d", gbn.parents("d")) + def test_cvl_local_score_spbn(): - spbn = pbn.SemiparametricBN([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')], - [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]) - + spbn = pbn.SemiparametricBN( + [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")], + [("a", pbn.CKDEType()), 
("c", pbn.CKDEType())], + ) + cvl = pbn.CVLikelihood(df, 10, seed) - assert np.isclose(cvl.local_score(spbn, 'a', []), - numpy_local_score(pbn.CKDEType(), df, 'a', [])) - assert np.isclose(cvl.local_score(spbn, 'b', ['a']), - numpy_local_score(pbn.LinearGaussianCPDType(), df, 'b', ['a'])) - assert np.isclose(cvl.local_score(spbn, 'c', ['a', 'b']), - numpy_local_score(pbn.CKDEType(), df, 'c', ['a', 'b'])) - assert np.isclose(cvl.local_score(spbn, 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.LinearGaussianCPDType(), df, 'd', ['a', 'b', 'c'])) - assert np.isclose(cvl.local_score(spbn, 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.LinearGaussianCPDType(), df, 'd', ['b', 'c', 'a'])) - - assert cvl.local_score(spbn, 'a') == cvl.local_score(spbn, 'a', spbn.parents('a')) - assert cvl.local_score(spbn, 'b') == cvl.local_score(spbn, 'b', spbn.parents('b')) - assert cvl.local_score(spbn, 'c') == cvl.local_score(spbn, 'c', spbn.parents('c')) - assert cvl.local_score(spbn, 'd') == cvl.local_score(spbn, 'd', spbn.parents('d')) - - assert np.isclose(cvl.local_score_node_type(spbn, pbn.LinearGaussianCPDType(), 'a', []), - numpy_local_score(pbn.LinearGaussianCPDType(), df, 'a', [])) - assert np.isclose(cvl.local_score_node_type(spbn, pbn.CKDEType(), 'b', ['a']), - numpy_local_score(pbn.CKDEType(), df, 'b', ['a'])) - assert np.isclose(cvl.local_score_node_type(spbn, pbn.LinearGaussianCPDType(), 'c', ['a', 'b']), - numpy_local_score(pbn.LinearGaussianCPDType(), df, 'c', ['a', 'b'])) - assert np.isclose(cvl.local_score_node_type(spbn, pbn.CKDEType(), 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.CKDEType(), df, 'd', ['a', 'b', 'c'])) - assert np.isclose(cvl.local_score_node_type(spbn, pbn.CKDEType(), 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.CKDEType(), df, 'd', ['b', 'c', 'a'])) + assert np.isclose( + cvl.local_score(spbn, "a", []), numpy_local_score(pbn.CKDEType(), df, "a", []) + ) + assert np.isclose( + cvl.local_score(spbn, "b", ["a"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "b", ["a"]), + ) + assert np.isclose( + cvl.local_score(spbn, "c", ["a", "b"]), + numpy_local_score(pbn.CKDEType(), df, "c", ["a", "b"]), + ) + assert np.isclose( + cvl.local_score(spbn, "d", ["a", "b", "c"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "d", ["a", "b", "c"]), + ) + assert np.isclose( + cvl.local_score(spbn, "d", ["a", "b", "c"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "d", ["b", "c", "a"]), + ) + + assert cvl.local_score(spbn, "a") == cvl.local_score(spbn, "a", spbn.parents("a")) + assert cvl.local_score(spbn, "b") == cvl.local_score(spbn, "b", spbn.parents("b")) + assert cvl.local_score(spbn, "c") == cvl.local_score(spbn, "c", spbn.parents("c")) + assert cvl.local_score(spbn, "d") == cvl.local_score(spbn, "d", spbn.parents("d")) + + assert np.isclose( + cvl.local_score_node_type(spbn, pbn.LinearGaussianCPDType(), "a", []), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "a", []), + ) + assert np.isclose( + cvl.local_score_node_type(spbn, pbn.CKDEType(), "b", ["a"]), + numpy_local_score(pbn.CKDEType(), df, "b", ["a"]), + ) + assert np.isclose( + cvl.local_score_node_type(spbn, pbn.LinearGaussianCPDType(), "c", ["a", "b"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "c", ["a", "b"]), + ) + assert np.isclose( + cvl.local_score_node_type(spbn, pbn.CKDEType(), "d", ["a", "b", "c"]), + numpy_local_score(pbn.CKDEType(), df, "d", ["a", "b", "c"]), + ) + assert np.isclose( + cvl.local_score_node_type(spbn, pbn.CKDEType(), "d", ["a", "b", "c"]), + 
numpy_local_score(pbn.CKDEType(), df, "d", ["b", "c", "a"]), + ) def test_cvl_local_score_null_spbn(): - spbn = pbn.SemiparametricBN([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')], - [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]) - + spbn = pbn.SemiparametricBN( + [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")], + [("a", pbn.CKDEType()), ("c", pbn.CKDEType())], + ) + np.random.seed(0) a_null = np.random.randint(0, SIZE, size=100) b_null = np.random.randint(0, SIZE, size=100) @@ -166,56 +232,89 @@ def test_cvl_local_score_null_spbn(): d_null = np.random.randint(0, SIZE, size=100) df_null = df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "a"] = np.nan + df_null.loc[df_null.index[b_null], "b"] = np.nan + df_null.loc[df_null.index[c_null], "c"] = np.nan + df_null.loc[df_null.index[d_null], "d"] = np.nan cvl = pbn.CVLikelihood(df_null, 10, seed) - assert np.isclose(cvl.local_score(spbn, 'a', []), - numpy_local_score(pbn.CKDEType(), df_null, 'a', [])) - assert np.isclose(cvl.local_score(spbn, 'b', ['a']), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, 'b', ['a'])) - assert np.isclose(cvl.local_score(spbn, 'c', ['a', 'b']), - numpy_local_score(pbn.CKDEType(), df_null, 'c', ['a', 'b'])) - assert np.isclose(cvl.local_score(spbn, 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, 'd', ['a', 'b', 'c'])) - assert np.isclose(cvl.local_score(spbn, 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, 'd', ['b', 'c', 'a'])) - - assert cvl.local_score(spbn, 'a') == cvl.local_score(spbn, 'a', spbn.parents('a')) - assert cvl.local_score(spbn, 'b') == cvl.local_score(spbn, 'b', spbn.parents('b')) - assert cvl.local_score(spbn, 'c') == cvl.local_score(spbn, 'c', spbn.parents('c')) - assert cvl.local_score(spbn, 'd') == cvl.local_score(spbn, 'd', spbn.parents('d')) - - assert np.isclose(cvl.local_score_node_type(spbn, pbn.LinearGaussianCPDType(), 'a', []), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, 'a', [])) - assert np.isclose(cvl.local_score_node_type(spbn, pbn.CKDEType(), 'b', ['a']), - numpy_local_score(pbn.CKDEType(), df_null, 'b', ['a'])) - assert np.isclose(cvl.local_score_node_type(spbn, pbn.LinearGaussianCPDType(), 'c', ['a', 'b']), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, 'c', ['a', 'b'])) - assert np.isclose(cvl.local_score_node_type(spbn, pbn.CKDEType(), 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.CKDEType(), df_null, 'd', ['a', 'b', 'c'])) - assert np.isclose(cvl.local_score_node_type(spbn, pbn.CKDEType(), 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.CKDEType(), df_null, 'd', ['b', 'c', 'a'])) + assert np.isclose( + cvl.local_score(spbn, "a", []), + numpy_local_score(pbn.CKDEType(), df_null, "a", []), + ) + assert np.isclose( + cvl.local_score(spbn, "b", ["a"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "b", ["a"]), + ) + assert np.isclose( + cvl.local_score(spbn, "c", ["a", "b"]), + numpy_local_score(pbn.CKDEType(), df_null, "c", ["a", "b"]), + ) + assert np.isclose( + cvl.local_score(spbn, "d", ["a", "b", "c"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "d", ["a", "b", "c"]), + ) + assert np.isclose( + cvl.local_score(spbn, "d", ["a", "b", "c"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "d", 
["b", "c", "a"]), + ) + + assert cvl.local_score(spbn, "a") == cvl.local_score(spbn, "a", spbn.parents("a")) + assert cvl.local_score(spbn, "b") == cvl.local_score(spbn, "b", spbn.parents("b")) + assert cvl.local_score(spbn, "c") == cvl.local_score(spbn, "c", spbn.parents("c")) + assert cvl.local_score(spbn, "d") == cvl.local_score(spbn, "d", spbn.parents("d")) + + assert np.isclose( + cvl.local_score_node_type(spbn, pbn.LinearGaussianCPDType(), "a", []), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "a", []), + ) + assert np.isclose( + cvl.local_score_node_type(spbn, pbn.CKDEType(), "b", ["a"]), + numpy_local_score(pbn.CKDEType(), df_null, "b", ["a"]), + ) + assert np.isclose( + cvl.local_score_node_type(spbn, pbn.LinearGaussianCPDType(), "c", ["a", "b"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "c", ["a", "b"]), + ) + assert np.isclose( + cvl.local_score_node_type(spbn, pbn.CKDEType(), "d", ["a", "b", "c"]), + numpy_local_score(pbn.CKDEType(), df_null, "d", ["a", "b", "c"]), + ) + assert np.isclose( + cvl.local_score_node_type(spbn, pbn.CKDEType(), "d", ["a", "b", "c"]), + numpy_local_score(pbn.CKDEType(), df_null, "d", ["b", "c", "a"]), + ) + def test_cvl_score(): - gbn = pbn.GaussianNetwork([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) + gbn = pbn.GaussianNetwork( + [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")] + ) cv = pbn.CVLikelihood(df, 10, 0) - assert np.isclose(cv.score(gbn), ( - cv.local_score(gbn, 'a', []) + - cv.local_score(gbn, 'b', ['a']) + - cv.local_score(gbn, 'c', ['a', 'b']) + - cv.local_score(gbn, 'd', ['a', 'b', 'c']))) - - spbn = pbn.SemiparametricBN([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')], - [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]) - - assert np.isclose(cv.score(spbn), ( - cv.local_score(spbn, 'a') + - cv.local_score(spbn, 'b') + - cv.local_score(spbn, 'c') + - cv.local_score(spbn, 'd'))) \ No newline at end of file + assert np.isclose( + cv.score(gbn), + ( + cv.local_score(gbn, "a", []) + + cv.local_score(gbn, "b", ["a"]) + + cv.local_score(gbn, "c", ["a", "b"]) + + cv.local_score(gbn, "d", ["a", "b", "c"]) + ), + ) + + spbn = pbn.SemiparametricBN( + [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")], + [("a", pbn.CKDEType()), ("c", pbn.CKDEType())], + ) + + assert np.isclose( + cv.score(spbn), + ( + cv.local_score(spbn, "a") + + cv.local_score(spbn, "b") + + cv.local_score(spbn, "c") + + cv.local_score(spbn, "d") + ), + ) diff --git a/tests/learning/scores/holdoutlikelihood_test.py b/tests/learning/scores/holdoutlikelihood_test.py index 199889a3..5a2e051e 100644 --- a/tests/learning/scores/holdoutlikelihood_test.py +++ b/tests/learning/scores/holdoutlikelihood_test.py @@ -8,6 +8,7 @@ df = util_test.generate_normal_data(SIZE) seed = 0 + def numpy_local_score(node_type, training_data, test_data, variable, evidence): if isinstance(variable, str): node_data = training_data.loc[:, [variable] + evidence].dropna() @@ -28,20 +29,31 @@ def numpy_local_score(node_type, training_data, test_data, variable, evidence): N = variable_data.shape[0] d = evidence_data.shape[1] linregress_data = np.column_stack((np.ones(N), evidence_data.to_numpy())) - (beta, res, _, _) = np.linalg.lstsq(linregress_data, variable_data.to_numpy(), rcond=None) + (beta, res, _, _) = np.linalg.lstsq( + linregress_data, variable_data.to_numpy(), rcond=None + ) var = res / (N - d - 1) - means = beta[0] + np.sum(beta[1:]*test_evidence_data, axis=1) + means = 
beta[0] + np.sum(beta[1:] * test_evidence_data, axis=1) return norm.logpdf(test_variable_data, means, np.sqrt(var)).sum() elif node_type == pbn.CKDEType(): - k_joint = gaussian_kde(node_data.to_numpy().T, - bw_method=lambda s : np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor()) + k_joint = gaussian_kde( + node_data.to_numpy().T, + bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4)) + * s.scotts_factor(), + ) if evidence: - k_marg = gaussian_kde(evidence_data.to_numpy().T, bw_method=k_joint.covariance_factor()) - return np.sum(k_joint.logpdf(test_node_data.to_numpy().T) - k_marg.logpdf(test_evidence_data.to_numpy().T)) + k_marg = gaussian_kde( + evidence_data.to_numpy().T, bw_method=k_joint.covariance_factor() + ) + return np.sum( + k_joint.logpdf(test_node_data.to_numpy().T) + - k_marg.logpdf(test_evidence_data.to_numpy().T) + ) else: return np.sum(k_joint.logpdf(test_node_data.to_numpy().T)) + def test_holdout_create(): s = pbn.HoldoutLikelihood(df) assert s.training_data().num_rows == 0.8 * SIZE @@ -50,7 +62,7 @@ def test_holdout_create(): s = pbn.HoldoutLikelihood(df, 0.5) assert s.training_data().num_rows == 0.5 * SIZE assert s.test_data().num_rows == 0.5 * SIZE - + s = pbn.HoldoutLikelihood(df, 0.2, 0) s2 = pbn.HoldoutLikelihood(df, 0.2, 0) @@ -59,37 +71,76 @@ def test_holdout_create(): with pytest.raises(ValueError) as ex: s = pbn.HoldoutLikelihood(df, 10, 0) - assert "test_ratio must be a number" in str(ex.value) + assert "test_ratio must be a number" in str(ex.value) with pytest.raises(ValueError) as ex: s = pbn.HoldoutLikelihood(df, 0, 0) - assert "test_ratio must be a number" in str(ex.value) + assert "test_ratio must be a number" in str(ex.value) def test_holdout_local_score_gbn(): - gbn = pbn.GaussianNetwork([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) - + gbn = pbn.GaussianNetwork( + [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")] + ) + hl = pbn.HoldoutLikelihood(df, 0.2, seed) - assert np.isclose(hl.local_score(gbn, 'a', []), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'a', [])) - assert np.isclose(hl.local_score(gbn, 'b', ['a']), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'b', ['a'])) - assert np.isclose(hl.local_score(gbn, 'c', ['a', 'b']), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'c', ['a', 'b'])) - assert np.isclose(hl.local_score(gbn, 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'd', ['a', 'b', 'c'])) - assert np.isclose(hl.local_score(gbn, 'd', ['a', 'b', 'c']), - hl.local_score(gbn, 'd', ['b', 'c', 'a'])) - - assert hl.local_score(gbn, 'a') == hl.local_score(gbn, 'a', gbn.parents('a')) - assert hl.local_score(gbn, 'b') == hl.local_score(gbn, 'b', gbn.parents('b')) - assert hl.local_score(gbn, 'c') == hl.local_score(gbn, 'c', gbn.parents('c')) - assert hl.local_score(gbn, 'd') == hl.local_score(gbn, 'd', gbn.parents('d')) + assert np.isclose( + hl.local_score(gbn, "a", []), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "a", + [], + ), + ) + assert np.isclose( + hl.local_score(gbn, "b", ["a"]), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "b", + ["a"], + ), + ) + assert 
np.isclose( + hl.local_score(gbn, "c", ["a", "b"]), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "c", + ["a", "b"], + ), + ) + assert np.isclose( + hl.local_score(gbn, "d", ["a", "b", "c"]), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "d", + ["a", "b", "c"], + ), + ) + assert np.isclose( + hl.local_score(gbn, "d", ["a", "b", "c"]), + hl.local_score(gbn, "d", ["b", "c", "a"]), + ) + + assert hl.local_score(gbn, "a") == hl.local_score(gbn, "a", gbn.parents("a")) + assert hl.local_score(gbn, "b") == hl.local_score(gbn, "b", gbn.parents("b")) + assert hl.local_score(gbn, "c") == hl.local_score(gbn, "c", gbn.parents("c")) + assert hl.local_score(gbn, "d") == hl.local_score(gbn, "d", gbn.parents("d")) + def test_holdout_local_score_gbn_null(): - gbn = pbn.GaussianNetwork([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) - + gbn = pbn.GaussianNetwork( + [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")] + ) + np.random.seed(0) a_null = np.random.randint(0, SIZE, size=100) b_null = np.random.randint(0, SIZE, size=100) @@ -97,55 +148,135 @@ def test_holdout_local_score_gbn_null(): d_null = np.random.randint(0, SIZE, size=100) df_null = df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "a"] = np.nan + df_null.loc[df_null.index[b_null], "b"] = np.nan + df_null.loc[df_null.index[c_null], "c"] = np.nan + df_null.loc[df_null.index[d_null], "d"] = np.nan hl = pbn.HoldoutLikelihood(df_null, 0.2, seed) - assert np.isclose(hl.local_score(gbn, 'a', []), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'a', [])) - assert np.isclose(hl.local_score(gbn, 'b', ['a']), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'b', ['a'])) - assert np.isclose(hl.local_score(gbn, 'c', ['a', 'b']), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'c', ['a', 'b'])) - assert np.isclose(hl.local_score(gbn, 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'd', ['a', 'b', 'c'])) - assert np.isclose(hl.local_score(gbn, 'd', ['a', 'b', 'c']), - hl.local_score(gbn, 'd', ['b', 'c', 'a'])) - - assert hl.local_score(gbn, 'a') == hl.local_score(gbn, 'a', gbn.parents('a')) - assert hl.local_score(gbn, 'b') == hl.local_score(gbn, 'b', gbn.parents('b')) - assert hl.local_score(gbn, 'c') == hl.local_score(gbn, 'c', gbn.parents('c')) - assert hl.local_score(gbn, 'd') == hl.local_score(gbn, 'd', gbn.parents('d')) + assert np.isclose( + hl.local_score(gbn, "a", []), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "a", + [], + ), + ) + assert np.isclose( + hl.local_score(gbn, "b", ["a"]), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "b", + ["a"], + ), + ) + assert np.isclose( + hl.local_score(gbn, "c", ["a", "b"]), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "c", + ["a", "b"], + ), + ) + 
assert np.isclose( + hl.local_score(gbn, "d", ["a", "b", "c"]), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "d", + ["a", "b", "c"], + ), + ) + assert np.isclose( + hl.local_score(gbn, "d", ["a", "b", "c"]), + hl.local_score(gbn, "d", ["b", "c", "a"]), + ) + + assert hl.local_score(gbn, "a") == hl.local_score(gbn, "a", gbn.parents("a")) + assert hl.local_score(gbn, "b") == hl.local_score(gbn, "b", gbn.parents("b")) + assert hl.local_score(gbn, "c") == hl.local_score(gbn, "c", gbn.parents("c")) + assert hl.local_score(gbn, "d") == hl.local_score(gbn, "d", gbn.parents("d")) + def test_holdout_local_score_spbn(): - spbn = pbn.SemiparametricBN([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')], - [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]) - + spbn = pbn.SemiparametricBN( + [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")], + [("a", pbn.CKDEType()), ("c", pbn.CKDEType())], + ) + hl = pbn.HoldoutLikelihood(df, 0.2, seed) - assert np.isclose(hl.local_score(spbn, 'a', []), - numpy_local_score(pbn.CKDEType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'a', [])) - assert np.isclose(hl.local_score(spbn, 'b', ['a']), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'b', ['a'])) - assert np.isclose(hl.local_score(spbn, 'c', ['a', 'b']), - numpy_local_score(pbn.CKDEType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'c', ['a', 'b'])) - assert np.isclose(hl.local_score(spbn, 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'd', ['a', 'b', 'c'])) - assert np.isclose(hl.local_score(spbn, 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'd', ['b', 'c', 'a'])) - - assert hl.local_score(spbn, 'a') == hl.local_score(spbn, 'a', spbn.parents('a')) - assert hl.local_score(spbn, 'b') == hl.local_score(spbn, 'b', spbn.parents('b')) - assert hl.local_score(spbn, 'c') == hl.local_score(spbn, 'c', spbn.parents('c')) - assert hl.local_score(spbn, 'd') == hl.local_score(spbn, 'd', spbn.parents('d')) + assert np.isclose( + hl.local_score(spbn, "a", []), + numpy_local_score( + pbn.CKDEType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "a", + [], + ), + ) + assert np.isclose( + hl.local_score(spbn, "b", ["a"]), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "b", + ["a"], + ), + ) + assert np.isclose( + hl.local_score(spbn, "c", ["a", "b"]), + numpy_local_score( + pbn.CKDEType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "c", + ["a", "b"], + ), + ) + assert np.isclose( + hl.local_score(spbn, "d", ["a", "b", "c"]), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "d", + ["a", "b", "c"], + ), + ) + assert np.isclose( + hl.local_score(spbn, "d", ["a", "b", "c"]), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "d", + ["b", "c", "a"], + ), + ) + + assert hl.local_score(spbn, "a") == hl.local_score(spbn, "a", spbn.parents("a")) + assert hl.local_score(spbn, "b") == hl.local_score(spbn, "b", spbn.parents("b")) + assert hl.local_score(spbn, "c") == hl.local_score(spbn, "c", spbn.parents("c")) + assert 
hl.local_score(spbn, "d") == hl.local_score(spbn, "d", spbn.parents("d")) + def test_holdout_local_score_null_spbn(): - spbn = pbn.SemiparametricBN([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')], - [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]) - + spbn = pbn.SemiparametricBN( + [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")], + [("a", pbn.CKDEType()), ("c", pbn.CKDEType())], + ) + np.random.seed(0) a_null = np.random.randint(0, SIZE, size=100) b_null = np.random.randint(0, SIZE, size=100) @@ -153,45 +284,98 @@ def test_holdout_local_score_null_spbn(): d_null = np.random.randint(0, SIZE, size=100) df_null = df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "a"] = np.nan + df_null.loc[df_null.index[b_null], "b"] = np.nan + df_null.loc[df_null.index[c_null], "c"] = np.nan + df_null.loc[df_null.index[d_null], "d"] = np.nan hl = pbn.HoldoutLikelihood(df_null, 0.2, seed) - assert np.isclose(hl.local_score(spbn, 'a', []), - numpy_local_score(pbn.CKDEType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'a', [])) - assert np.isclose(hl.local_score(spbn, 'b', ['a']), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'b', ['a'])) - assert np.isclose(hl.local_score(spbn, 'c', ['a', 'b']), - numpy_local_score(pbn.CKDEType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'c', ['a', 'b'])) - assert np.isclose(hl.local_score(spbn, 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'd', ['a', 'b', 'c'])) - assert np.isclose(hl.local_score(spbn, 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'd', ['b', 'c', 'a'])) - - assert hl.local_score(spbn, 'a') == hl.local_score(spbn, 'a', spbn.parents('a')) - assert hl.local_score(spbn, 'b') == hl.local_score(spbn, 'b', spbn.parents('b')) - assert hl.local_score(spbn, 'c') == hl.local_score(spbn, 'c', spbn.parents('c')) - assert hl.local_score(spbn, 'd') == hl.local_score(spbn, 'd', spbn.parents('d')) + assert np.isclose( + hl.local_score(spbn, "a", []), + numpy_local_score( + pbn.CKDEType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "a", + [], + ), + ) + assert np.isclose( + hl.local_score(spbn, "b", ["a"]), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "b", + ["a"], + ), + ) + assert np.isclose( + hl.local_score(spbn, "c", ["a", "b"]), + numpy_local_score( + pbn.CKDEType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "c", + ["a", "b"], + ), + ) + assert np.isclose( + hl.local_score(spbn, "d", ["a", "b", "c"]), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "d", + ["a", "b", "c"], + ), + ) + assert np.isclose( + hl.local_score(spbn, "d", ["a", "b", "c"]), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "d", + ["b", "c", "a"], + ), + ) + + assert hl.local_score(spbn, "a") == hl.local_score(spbn, "a", spbn.parents("a")) + assert hl.local_score(spbn, "b") == hl.local_score(spbn, "b", spbn.parents("b")) + assert 
hl.local_score(spbn, "c") == hl.local_score(spbn, "c", spbn.parents("c")) + assert hl.local_score(spbn, "d") == hl.local_score(spbn, "d", spbn.parents("d")) + def test_holdout_score(): - gbn = pbn.GaussianNetwork([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) + gbn = pbn.GaussianNetwork( + [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")] + ) hl = pbn.HoldoutLikelihood(df, 0.2, 0) - assert np.isclose(hl.score(gbn), ( - hl.local_score(gbn, 'a', []) + - hl.local_score(gbn, 'b', ['a']) + - hl.local_score(gbn, 'c', ['a', 'b']) + - hl.local_score(gbn, 'd', ['a', 'b', 'c']))) + assert np.isclose( + hl.score(gbn), + ( + hl.local_score(gbn, "a", []) + + hl.local_score(gbn, "b", ["a"]) + + hl.local_score(gbn, "c", ["a", "b"]) + + hl.local_score(gbn, "d", ["a", "b", "c"]) + ), + ) - spbn = pbn.SemiparametricBN([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')], - [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]) + spbn = pbn.SemiparametricBN( + [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")], + [("a", pbn.CKDEType()), ("c", pbn.CKDEType())], + ) - assert np.isclose(hl.score(spbn), ( - hl.local_score(spbn, 'a') + - hl.local_score(spbn, 'b') + - hl.local_score(spbn, 'c') + - hl.local_score(spbn, 'd'))) \ No newline at end of file + assert np.isclose( + hl.score(spbn), + ( + hl.local_score(spbn, "a") + + hl.local_score(spbn, "b") + + hl.local_score(spbn, "c") + + hl.local_score(spbn, "d") + ), + ) diff --git a/tests/models/BayesianNetwork_test.py b/tests/models/BayesianNetwork_test.py index e835245e..09ff47d5 100644 --- a/tests/models/BayesianNetwork_test.py +++ b/tests/models/BayesianNetwork_test.py @@ -6,52 +6,59 @@ df = util_test.generate_normal_data(10000) + def test_create_bn(): - gbn = GaussianNetwork(['a', 'b', 'c', 'd']) + gbn = GaussianNetwork(["a", "b", "c", "d"]) assert gbn.num_nodes() == 4 assert gbn.num_arcs() == 0 - assert gbn.nodes() == ['a', 'b', 'c', 'd'] + assert gbn.nodes() == ["a", "b", "c", "d"] - gbn = GaussianNetwork(['a', 'b', 'c', 'd'], [('a', 'c')]) + gbn = GaussianNetwork(["a", "b", "c", "d"], [("a", "c")]) assert gbn.num_nodes() == 4 assert gbn.num_arcs() == 1 - assert gbn.nodes() == ['a', 'b', 'c', 'd'] + assert gbn.nodes() == ["a", "b", "c", "d"] - gbn = GaussianNetwork([('a', 'c'), ('b', 'd'), ('c', 'd')]) + gbn = GaussianNetwork([("a", "c"), ("b", "d"), ("c", "d")]) assert gbn.num_nodes() == 4 assert gbn.num_arcs() == 3 - assert gbn.nodes() == ['a', 'c', 'b', 'd'] + assert gbn.nodes() == ["a", "c", "b", "d"] with pytest.raises(TypeError) as ex: - gbn = GaussianNetwork(['a', 'b', 'c'], [('a', 'c', 'b')]) + gbn = GaussianNetwork(["a", "b", "c"], [("a", "c", "b")]) assert "incompatible constructor arguments" in str(ex.value) - + with pytest.raises(IndexError) as ex: - gbn = GaussianNetwork(['a', 'b', 'c'], [('a', 'd')]) + gbn = GaussianNetwork(["a", "b", "c"], [("a", "d")]) assert "not present in the graph" in str(ex.value) with pytest.raises(ValueError) as ex: - gbn = GaussianNetwork([('a', 'b'), ('b', 'c'), ('c', 'a')]) + gbn = GaussianNetwork([("a", "b"), ("b", "c"), ("c", "a")]) assert "must be a DAG" in str(ex.value) with pytest.raises(ValueError) as ex: - gbn = GaussianNetwork(['a', 'b', 'c', 'd'], [('a', 'b'), ('b', 'c'), ('c', 'a')]) + gbn = GaussianNetwork( + ["a", "b", "c", "d"], [("a", "b"), ("b", "c"), ("c", "a")] + ) assert "must be a DAG" in str(ex.value) with pytest.raises(ValueError) as ex: - gbn = BayesianNetwork(pbn.GaussianNetworkType(), ['a', 'b', 'c', 'd'], 
[], [('a', pbn.CKDEType())]) + gbn = BayesianNetwork( + pbn.GaussianNetworkType(), ["a", "b", "c", "d"], [], [("a", pbn.CKDEType())] + ) assert "Wrong factor type" in str(ex.value) - + + def gbn_generator(): # Test different Networks created with different constructors. - gbn = GaussianNetwork(['a', 'b', 'c', 'd']) + gbn = GaussianNetwork(["a", "b", "c", "d"]) yield gbn - gbn = GaussianNetwork([('a', 'c'), ('b', 'd'), ('c', 'd')]) + gbn = GaussianNetwork([("a", "c"), ("b", "d"), ("c", "d")]) yield gbn - gbn = GaussianNetwork(['a', 'b', 'c', 'd'], [('a', 'b'), ('b', 'c')]) + gbn = GaussianNetwork(["a", "b", "c", "d"], [("a", "b"), ("b", "c")]) yield gbn + def test_nodes_util(): for gbn in gbn_generator(): assert gbn.num_nodes() == 4 @@ -59,167 +66,172 @@ def test_nodes_util(): nodes = gbn.nodes() indices = gbn.indices() - assert nodes[gbn.index('a')] == 'a' - assert nodes[gbn.index('b')] == 'b' - assert nodes[gbn.index('c')] == 'c' - assert nodes[gbn.index('d')] == 'd' + assert nodes[gbn.index("a")] == "a" + assert nodes[gbn.index("b")] == "b" + assert nodes[gbn.index("c")] == "c" + assert nodes[gbn.index("d")] == "d" assert indices[gbn.name(0)] == 0 assert indices[gbn.name(1)] == 1 assert indices[gbn.name(2)] == 2 assert indices[gbn.name(3)] == 3 - assert gbn.contains_node('a') - assert gbn.contains_node('b') - assert gbn.contains_node('c') - assert gbn.contains_node('d') - assert not gbn.contains_node('e') + assert gbn.contains_node("a") + assert gbn.contains_node("b") + assert gbn.contains_node("c") + assert gbn.contains_node("d") + assert not gbn.contains_node("e") + def test_parent_children(): - gbn = GaussianNetwork(['a', 'b', 'c', 'd']) - - assert gbn.num_parents('a') == 0 - assert gbn.num_parents('b') == 0 - assert gbn.num_parents('c') == 0 - assert gbn.num_parents('d') == 0 - - assert gbn.parents('a') == [] - assert gbn.parents('b') == [] - assert gbn.parents('c') == [] - assert gbn.parents('d') == [] - - assert gbn.num_children('a') == 0 - assert gbn.num_children('b') == 0 - assert gbn.num_children('c') == 0 - assert gbn.num_children('d') == 0 - - gbn = GaussianNetwork([('a', 'c'), ('b', 'd'), ('c', 'd')]) - - assert gbn.num_parents('a') == 0 - assert gbn.num_parents('b') == 0 - assert gbn.num_parents('c') == 1 - assert gbn.num_parents('d') == 2 - - assert gbn.parents('a') == [] - assert gbn.parents('b') == [] - assert gbn.parents('c') == ['a'] - assert set(gbn.parents('d')) == set(['b', 'c']) - - assert gbn.num_children('a') == 1 - assert gbn.num_children('b') == 1 - assert gbn.num_children('c') == 1 - assert gbn.num_children('d') == 0 - - gbn = GaussianNetwork(['a', 'b', 'c', 'd'], [('a', 'b'), ('b', 'c')]) - - assert gbn.num_parents('a') == 0 - assert gbn.num_parents('b') == 1 - assert gbn.num_parents('c') == 1 - assert gbn.num_parents('d') == 0 - - assert gbn.parents('a') == [] - assert gbn.parents('b') == ['a'] - assert gbn.parents('c') == ['b'] - assert gbn.parents('d') == [] - - assert gbn.num_children('a') == 1 - assert gbn.num_children('b') == 1 - assert gbn.num_children('c') == 0 - assert gbn.num_children('d') == 0 + gbn = GaussianNetwork(["a", "b", "c", "d"]) + + assert gbn.num_parents("a") == 0 + assert gbn.num_parents("b") == 0 + assert gbn.num_parents("c") == 0 + assert gbn.num_parents("d") == 0 + + assert gbn.parents("a") == [] + assert gbn.parents("b") == [] + assert gbn.parents("c") == [] + assert gbn.parents("d") == [] + + assert gbn.num_children("a") == 0 + assert gbn.num_children("b") == 0 + assert gbn.num_children("c") == 0 + assert gbn.num_children("d") == 0 
+
+    gbn = GaussianNetwork([("a", "c"), ("b", "d"), ("c", "d")])
+
+    assert gbn.num_parents("a") == 0
+    assert gbn.num_parents("b") == 0
+    assert gbn.num_parents("c") == 1
+    assert gbn.num_parents("d") == 2
+
+    assert gbn.parents("a") == []
+    assert gbn.parents("b") == []
+    assert gbn.parents("c") == ["a"]
+    assert set(gbn.parents("d")) == set(["b", "c"])
+
+    assert gbn.num_children("a") == 1
+    assert gbn.num_children("b") == 1
+    assert gbn.num_children("c") == 1
+    assert gbn.num_children("d") == 0
+
+    gbn = GaussianNetwork(["a", "b", "c", "d"], [("a", "b"), ("b", "c")])
+
+    assert gbn.num_parents("a") == 0
+    assert gbn.num_parents("b") == 1
+    assert gbn.num_parents("c") == 1
+    assert gbn.num_parents("d") == 0
+
+    assert gbn.parents("a") == []
+    assert gbn.parents("b") == ["a"]
+    assert gbn.parents("c") == ["b"]
+    assert gbn.parents("d") == []
+
+    assert gbn.num_children("a") == 1
+    assert gbn.num_children("b") == 1
+    assert gbn.num_children("c") == 0
+    assert gbn.num_children("d") == 0
+
 
 def test_arcs():
-    gbn = GaussianNetwork(['a', 'b', 'c', 'd'])
+    gbn = GaussianNetwork(["a", "b", "c", "d"])
 
     assert gbn.num_arcs() == 0
     assert gbn.arcs() == []
-    assert not gbn.has_arc('a', 'b')
+    assert not gbn.has_arc("a", "b")
 
-    gbn.add_arc('a', 'b')
+    gbn.add_arc("a", "b")
     assert gbn.num_arcs() == 1
-    assert gbn.arcs() == [('a', 'b')]
-    assert gbn.parents('b') == ['a']
-    assert gbn.num_parents('b') == 1
-    assert gbn.num_children('a') == 1
-    assert gbn.has_arc('a', 'b')
+    assert gbn.arcs() == [("a", "b")]
+    assert gbn.parents("b") == ["a"]
+    assert gbn.num_parents("b") == 1
+    assert gbn.num_children("a") == 1
+    assert gbn.has_arc("a", "b")
 
-    gbn.add_arc('b', 'c')
+    gbn.add_arc("b", "c")
     assert gbn.num_arcs() == 2
-    assert set(gbn.arcs()) == set([('a', 'b'), ('b', 'c')])
-    assert gbn.parents('c') == ['b']
-    assert gbn.num_parents('c') == 1
-    assert gbn.num_children('b') == 1
-    assert gbn.has_arc('b', 'c')
-
-    gbn.add_arc('d', 'c')
+    assert set(gbn.arcs()) == set([("a", "b"), ("b", "c")])
+    assert gbn.parents("c") == ["b"]
+    assert gbn.num_parents("c") == 1
+    assert gbn.num_children("b") == 1
+    assert gbn.has_arc("b", "c")
+
+    gbn.add_arc("d", "c")
     assert gbn.num_arcs() == 3
-    assert set(gbn.arcs()) == set([('a', 'b'), ('b', 'c'), ('d', 'c')])
-    assert set(gbn.parents('c')) == set(['b', 'd'])
-    assert gbn.num_parents('c') == 2
-    assert gbn.num_children('d') == 1
-    assert gbn.has_arc('d', 'c')
-
-    assert gbn.has_path('a', 'c')
-    assert not gbn.has_path('a', 'd')
-    assert gbn.has_path('b', 'c')
-    assert gbn.has_path('d', 'c')
-
-    assert not gbn.can_add_arc('c', 'a')
-    # This edge exists, but virtually we consider that the addition is allowed.
-    assert gbn.can_add_arc('b', 'c')
-    assert gbn.can_add_arc('d', 'a')
-
-    gbn.add_arc('b', 'd')
+    assert set(gbn.arcs()) == set([("a", "b"), ("b", "c"), ("d", "c")])
+    assert set(gbn.parents("c")) == set(["b", "d"])
+    assert gbn.num_parents("c") == 2
+    assert gbn.num_children("d") == 1
+    assert gbn.has_arc("d", "c")
+
+    assert gbn.has_path("a", "c")
+    assert not gbn.has_path("a", "d")
+    assert gbn.has_path("b", "c")
+    assert gbn.has_path("d", "c")
+
+    assert not gbn.can_add_arc("c", "a")
+    # This edge already exists, so the addition is still considered allowed.
+ assert gbn.can_add_arc("b", "c") + assert gbn.can_add_arc("d", "a") + + gbn.add_arc("b", "d") assert gbn.num_arcs() == 4 - assert set(gbn.arcs()) == set([('a', 'b'), ('b', 'c'), ('d', 'c'), ('b', 'd')]) - assert gbn.parents('d') == ['b'] - assert gbn.num_parents('d') == 1 - assert gbn.num_children('b') == 2 - assert gbn.has_arc('b', 'd') - - assert gbn.has_path('a', 'd') - assert not gbn.can_add_arc('d', 'a') - assert not gbn.can_flip_arc('b', 'c') - assert gbn.can_flip_arc('a', 'b') + assert set(gbn.arcs()) == set([("a", "b"), ("b", "c"), ("d", "c"), ("b", "d")]) + assert gbn.parents("d") == ["b"] + assert gbn.num_parents("d") == 1 + assert gbn.num_children("b") == 2 + assert gbn.has_arc("b", "d") + + assert gbn.has_path("a", "d") + assert not gbn.can_add_arc("d", "a") + assert not gbn.can_flip_arc("b", "c") + assert gbn.can_flip_arc("a", "b") # This edge does not exist, but it could be flipped if it did. - assert gbn.can_flip_arc('d', 'a') + assert gbn.can_flip_arc("d", "a") # We can add an edge twice without changes. - gbn.add_arc('b', 'd') + gbn.add_arc("b", "d") assert gbn.num_arcs() == 4 - assert set(gbn.arcs()) == set([('a', 'b'), ('b', 'c'), ('d', 'c'), ('b', 'd')]) - assert gbn.parents('d') == ['b'] - assert gbn.num_parents('d') == 1 - assert gbn.num_children('b') == 2 - assert gbn.has_arc('b', 'd') + assert set(gbn.arcs()) == set([("a", "b"), ("b", "c"), ("d", "c"), ("b", "d")]) + assert gbn.parents("d") == ["b"] + assert gbn.num_parents("d") == 1 + assert gbn.num_children("b") == 2 + assert gbn.has_arc("b", "d") - gbn.remove_arc('b', 'c') + gbn.remove_arc("b", "c") assert gbn.num_arcs() == 3 - assert set(gbn.arcs()) == set([('a', 'b'), ('d', 'c'), ('b', 'd')]) - assert gbn.parents('c') == ['d'] - assert gbn.num_parents('c') == 1 - assert gbn.num_children('b') == 1 - assert not gbn.has_arc('b', 'c') - - assert gbn.can_add_arc('b', 'c') - assert not gbn.can_add_arc('c', 'b') - assert gbn.has_path('a', 'c') - assert gbn.has_path('b', 'c') - - gbn.remove_arc('d', 'c') + assert set(gbn.arcs()) == set([("a", "b"), ("d", "c"), ("b", "d")]) + assert gbn.parents("c") == ["d"] + assert gbn.num_parents("c") == 1 + assert gbn.num_children("b") == 1 + assert not gbn.has_arc("b", "c") + + assert gbn.can_add_arc("b", "c") + assert not gbn.can_add_arc("c", "b") + assert gbn.has_path("a", "c") + assert gbn.has_path("b", "c") + + gbn.remove_arc("d", "c") assert gbn.num_arcs() == 2 - assert set(gbn.arcs()) == set([('a', 'b'), ('b', 'd')]) - assert gbn.parents('c') == [] - assert gbn.num_parents('c') == 0 - assert gbn.num_children('d') == 0 - assert not gbn.has_arc('d', 'c') + assert set(gbn.arcs()) == set([("a", "b"), ("b", "d")]) + assert gbn.parents("c") == [] + assert gbn.num_parents("c") == 0 + assert gbn.num_children("d") == 0 + assert not gbn.has_arc("d", "c") + + assert gbn.can_add_arc("b", "c") + assert gbn.can_add_arc("c", "b") + assert not gbn.has_path("a", "c") + assert not gbn.has_path("b", "c") - assert gbn.can_add_arc('b', 'c') - assert gbn.can_add_arc('c', 'b') - assert not gbn.has_path('a', 'c') - assert not gbn.has_path('b', 'c') def test_bn_fit(): - gbn = GaussianNetwork([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) + gbn = GaussianNetwork( + [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")] + ) with pytest.raises(ValueError) as ex: for n in gbn.nodes(): @@ -234,71 +246,85 @@ def test_bn_fit(): assert cpd.evidence() == gbn.parents(n) gbn.fit(df) - - gbn.remove_arc('a', 'b') - cpd_b = gbn.cpd('b') - assert cpd_b.evidence != 
gbn.parents('b')
+    gbn.remove_arc("a", "b")
+
+    cpd_b = gbn.cpd("b")
+    assert cpd_b.evidence() != gbn.parents("b")
 
     gbn.fit(df)
-    cpd_b = gbn.cpd('b')
-    assert cpd_b.evidence() == gbn.parents('b')
+    cpd_b = gbn.cpd("b")
+    assert cpd_b.evidence() == gbn.parents("b")
+
 
 def test_add_cpds():
-    gbn = GaussianNetwork([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')])
-    
+    gbn = GaussianNetwork(
+        [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")]
+    )
+
     with pytest.raises(ValueError) as ex:
-        gbn.add_cpds([pbn.LinearGaussianCPD('e', [])])
+        gbn.add_cpds([pbn.LinearGaussianCPD("e", [])])
     assert "variable which is not present" in str(ex.value)
 
     with pytest.raises(ValueError) as ex:
-        gbn.add_cpds([pbn.LinearGaussianCPD('a', ['e'])])
+        gbn.add_cpds([pbn.LinearGaussianCPD("a", ["e"])])
     assert "Evidence variable" in str(ex.value)
 
     with pytest.raises(ValueError) as ex:
-        gbn.add_cpds([pbn.LinearGaussianCPD('a', ['b'])])
+        gbn.add_cpds([pbn.LinearGaussianCPD("a", ["b"])])
     assert "CPD do not have the model's parent set as evidence" in str(ex.value)
 
     with pytest.raises(ValueError) as ex:
-        gbn.add_cpds([pbn.LinearGaussianCPD('b', [])])
+        gbn.add_cpds([pbn.LinearGaussianCPD("b", [])])
     assert "CPD do not have the model's parent set as evidence" in str(ex.value)
 
     with pytest.raises(ValueError) as ex:
-        gbn.add_cpds([pbn.LinearGaussianCPD('b', ['c'])])
+        gbn.add_cpds([pbn.LinearGaussianCPD("b", ["c"])])
     assert "CPD do not have the model's parent set as evidence" in str(ex.value)
 
-    lg = pbn.LinearGaussianCPD('b', ['a'], [2.5, 1.65], 4)
+    lg = pbn.LinearGaussianCPD("b", ["a"], [2.5, 1.65], 4)
     assert lg.fitted()
 
     gbn.add_cpds([lg])
 
-    cpd_b = gbn.cpd('b')
-    assert cpd_b.variable() == 'b'
-    assert cpd_b.evidence() == ['a']
+    cpd_b = gbn.cpd("b")
+    assert cpd_b.variable() == "b"
+    assert cpd_b.evidence() == ["a"]
     assert cpd_b.fitted()
     assert np.all(cpd_b.beta == np.asarray([2.5, 1.65]))
     assert cpd_b.variance == 4
 
     with pytest.raises(ValueError) as ex:
-        cpd_a = gbn.cpd('a')
-    assert "CPD of variable \"a\" not added. Call add_cpds() or fit() to add the CPD." in str(ex.value)
+        cpd_a = gbn.cpd("a")
+    assert (
+        'CPD of variable "a" not added. Call add_cpds() or fit() to add the CPD.'
+        in str(ex.value)
+    )
 
     with pytest.raises(ValueError) as ex:
-        cpd_c = gbn.cpd('c')
-    assert "CPD of variable \"c\" not added. Call add_cpds() or fit() to add the CPD." in str(ex.value)
+        cpd_c = gbn.cpd("c")
    assert (
+        'CPD of variable "c" not added. Call add_cpds() or fit() to add the CPD.'
+        in str(ex.value)
+    )
 
     with pytest.raises(ValueError) as ex:
-        cpd_d = gbn.cpd('d')
-    assert "CPD of variable \"d\" not added. Call add_cpds() or fit() to add the CPD." in str(ex.value)
+        cpd_d = gbn.cpd("d")
+    assert (
+        'CPD of variable "d" not added. Call add_cpds() or fit() to add the CPD.'
+ in str(ex.value) + ) with pytest.raises(ValueError) as ex: - gbn.add_cpds([pbn.LinearGaussianCPD('e', [])]) + gbn.add_cpds([pbn.LinearGaussianCPD("e", [])]) assert "variable which is not present" in str(ex.value) def test_bn_logl(): - gbn = GaussianNetwork([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) + gbn = GaussianNetwork( + [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")] + ) gbn.fit(df) @@ -308,7 +334,7 @@ def test_bn_logl(): sum_ll = np.zeros((5000,)) sum_sll = 0 - + for n in gbn.nodes(): cpd = gbn.cpd(n) l = cpd.logl(test_df) @@ -316,23 +342,27 @@ def test_bn_logl(): assert np.all(np.isclose(s, l.sum())) sum_ll += l sum_sll += s - + assert np.all(np.isclose(ll, sum_ll)) assert np.isclose(sll, ll.sum()) assert sll == sum_sll + def test_bn_sample(): - gbn = GaussianNetwork(['a', 'c', 'b', 'd'], [('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) + gbn = GaussianNetwork( + ["a", "c", "b", "d"], + [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")], + ) gbn.fit(df) sample = gbn.sample(1000, 0, False) # Not ordered, so topological sort. - assert sample.schema.names == ['a', 'b', 'c', 'd'] + assert sample.schema.names == ["a", "b", "c", "d"] assert sample.num_rows == 1000 - + sample_ordered = gbn.sample(1000, 0, True) - assert sample_ordered.schema.names == ['a', 'c', 'b', 'd'] + assert sample_ordered.schema.names == ["a", "c", "b", "d"] assert sample_ordered.num_rows == 1000 assert sample.column(0).equals(sample_ordered.column(0)) diff --git a/tests/models/BayesianNetwork_type_test.py b/tests/models/BayesianNetwork_type_test.py index 7f661ec2..2ff8f4df 100644 --- a/tests/models/BayesianNetwork_type_test.py +++ b/tests/models/BayesianNetwork_type_test.py @@ -1,8 +1,16 @@ import pybnesian as pbn -from pybnesian import BayesianNetworkType, BayesianNetwork, ConditionalBayesianNetwork, GaussianNetwork,\ - SemiparametricBN, KDENetwork, DiscreteBN +from pybnesian import ( + BayesianNetworkType, + BayesianNetwork, + ConditionalBayesianNetwork, + GaussianNetwork, + SemiparametricBN, + KDENetwork, + DiscreteBN, +) import util_test + def test_bn_type(): g1 = GaussianNetwork(["a", "b", "c", "d"]) g2 = GaussianNetwork(["a", "b", "c", "d"]) @@ -47,6 +55,7 @@ def test_bn_type(): assert s1.type() != d1.type() assert k1.type() != d1.type() + def test_new_bn_type(): class MyGaussianNetworkType(BayesianNetworkType): def __init__(self): @@ -69,7 +78,7 @@ def can_have_arc(self, model, source, target): class MySemiparametricBNType(BayesianNetworkType): def __init__(self): BayesianNetworkType.__init__(self) - + b1 = MySemiparametricBNType() b2 = MySemiparametricBNType() b3 = MySemiparametricBNType() @@ -104,19 +113,28 @@ def can_have_arc(self, model, source, target): def __str__(self): return "MyRestrictedGaussianNetworkType" + class SpecificNetwork(BayesianNetwork): def __init__(self, variables, arcs=None): if arcs is None: BayesianNetwork.__init__(self, MyRestrictedGaussianNetworkType(), variables) else: - BayesianNetwork.__init__(self, MyRestrictedGaussianNetworkType(), variables, arcs) + BayesianNetwork.__init__( + self, MyRestrictedGaussianNetworkType(), variables, arcs + ) + class ConditionalSpecificNetwork(ConditionalBayesianNetwork): def __init__(self, variables, interface, arcs=None): if arcs is None: - ConditionalBayesianNetwork.__init__(self, MyRestrictedGaussianNetworkType(), variables, interface) + ConditionalBayesianNetwork.__init__( + self, MyRestrictedGaussianNetworkType(), variables, interface + ) 
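
test_bn_sample above pins down the sample(n, seed, ordered) contract: the result is an Arrow table, ordered=False yields a topological column order, and ordered=True keeps the node order given at construction. A self-contained sketch; the synthetic data, coefficients, and seed are made up:

    import numpy as np
    import pandas as pd
    from pybnesian import GaussianNetwork

    rng = np.random.default_rng(0)
    a = rng.normal(size=1000)
    df = pd.DataFrame({"a": a, "b": 1.0 + 2.0 * a + rng.normal(size=1000)})

    net = GaussianNetwork(["b", "a"], [("a", "b")])  # declared order: b, a
    net.fit(df)

    topo = net.sample(100, 0, False)
    assert topo.schema.names == ["a", "b"]       # parent first
    declared = net.sample(100, 0, True)
    assert declared.schema.names == ["b", "a"]   # construction order
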
else: - ConditionalBayesianNetwork.__init__(self, MyRestrictedGaussianNetworkType(), variables, interface, arcs) + ConditionalBayesianNetwork.__init__( + self, MyRestrictedGaussianNetworkType(), variables, interface, arcs + ) + def test_new_specific_bn_type(): sp1 = SpecificNetwork(["a", "b", "c", "d"]) @@ -147,7 +165,7 @@ def test_new_specific_bn_type(): # ####################### # Conditional BN # ####################### - + csp1 = ConditionalSpecificNetwork(["a", "b"], ["c", "d"]) csp2 = ConditionalSpecificNetwork(["a", "b"], ["c", "d"], [("a", "b")]) csp3 = ConditionalSpecificNetwork(["a", "b"], ["c", "d"]) diff --git a/tests/models/DynamicBayesianNetwork_test.py b/tests/models/DynamicBayesianNetwork_test.py index 892f96a2..db7677c5 100644 --- a/tests/models/DynamicBayesianNetwork_test.py +++ b/tests/models/DynamicBayesianNetwork_test.py @@ -4,11 +4,16 @@ import pandas as pd from scipy.stats import norm import pybnesian as pbn -from pybnesian import GaussianNetwork, ConditionalGaussianNetwork, DynamicGaussianNetwork +from pybnesian import ( + GaussianNetwork, + ConditionalGaussianNetwork, + DynamicGaussianNetwork, +) import util_test df = util_test.generate_normal_data(1000) + def test_create_dbn(): variables = ["a", "b", "c", "d"] gbn = DynamicGaussianNetwork(variables, 2) @@ -34,13 +39,18 @@ def test_create_dbn(): with pytest.raises(ValueError) as ex: gbn3 = DynamicGaussianNetwork(variables, 2, static_bn, wrong_transition_bn) - assert "Static and transition Bayesian networks do not have the same type" in str(ex.value) + assert "Static and transition Bayesian networks do not have the same type" in str( + ex.value + ) wrong_static_bn = pbn.DiscreteBN(static_nodes) with pytest.raises(ValueError) as ex: - gbn4 = DynamicGaussianNetwork(variables, 2, wrong_static_bn, wrong_transition_bn) + gbn4 = DynamicGaussianNetwork( + variables, 2, wrong_static_bn, wrong_transition_bn + ) assert "Bayesian networks are not Gaussian." 
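
MyRestrictedGaussianNetworkType above shows the extension pattern for network types: subclass BayesianNetworkType, call the base __init__, and override hooks such as can_have_arc(model, source, target). A condensed sketch with an invented restriction (the class name and rule are hypothetical; whether an arc is actually vetoed depends on can_have_arc() being consulted by the graph operations, which is what these tests probe):

    import pybnesian as pbn

    class NoParentsForA(pbn.BayesianNetworkType):
        # Hypothetical rule: node "a" may never be the target of an arc.
        def __init__(self):
            pbn.BayesianNetworkType.__init__(self)

        def can_have_arc(self, model, source, target):
            return target != "a"

        def __str__(self):
            return "NoParentsForA"

    net = pbn.BayesianNetwork(NoParentsForA(), ["a", "b", "c"])
    print(net.can_add_arc("a", "b"))  # expected: True
    print(net.can_add_arc("b", "a"))  # expected: False, vetoed by the type
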
in str(ex.value) + def test_variable_operations_dbn(): variables = ["a", "b", "c", "d"] gbn = DynamicGaussianNetwork(variables, 2) @@ -58,14 +68,22 @@ def test_variable_operations_dbn(): assert set(gbn.variables()) == set(["a", "b", "c", "d", "e"]) assert gbn.num_variables() == 5 - assert set(gbn.static_bn().nodes()) == set([v + "_t_" + str(m) for v in variables + ["e"] for m in range(1, 3)]) - assert set(gbn.transition_bn().nodes()) == set([v + "_t_0" for v in variables + ["e"]]) + assert set(gbn.static_bn().nodes()) == set( + [v + "_t_" + str(m) for v in variables + ["e"] for m in range(1, 3)] + ) + assert set(gbn.transition_bn().nodes()) == set( + [v + "_t_0" for v in variables + ["e"]] + ) gbn.remove_variable("b") assert set(gbn.variables()) == set(["a", "c", "d", "e"]) assert gbn.num_variables() == 4 - assert set(gbn.static_bn().nodes()) == set([v + "_t_" + str(m) for v in ["a", "c", "d", "e"] for m in range(1, 3)]) - assert set(gbn.transition_bn().nodes()) == set([v + "_t_0" for v in ["a", "c", "d", "e"]]) + assert set(gbn.static_bn().nodes()) == set( + [v + "_t_" + str(m) for v in ["a", "c", "d", "e"] for m in range(1, 3)] + ) + assert set(gbn.transition_bn().nodes()) == set( + [v + "_t_0" for v in ["a", "c", "d", "e"]] + ) def test_fit_dbn(): @@ -89,10 +107,12 @@ def test_fit_dbn(): assert gbn2.static_bn().fitted() assert gbn2.transition_bn().fitted() + def lg_logl_row(row, variable, evidence, beta, variance): m = beta[0] + beta[1:].dot(row[evidence]) return norm(m, np.sqrt(variance)).logpdf(row[variable]) + def static_logl(dbn, test_data, index, variable): sl = test_data.head(dbn.markovian_order()) @@ -102,16 +122,17 @@ def static_logl(dbn, test_data, index, variable): row_values = [sl.loc[index, variable]] for e in evidence: - m = re.search('(.*)_t_(\\d+)', e) + m = re.search("(.*)_t_(\\d+)", e) e_var = m[1] t = int(m[2]) - row_values.append(sl.loc[dbn.markovian_order()-t, e_var]) + row_values.append(sl.loc[dbn.markovian_order() - t, e_var]) r = pd.Series(data=row_values, index=[node_name] + evidence) return lg_logl_row(r, node_name, evidence, cpd.beta, cpd.variance) + def transition_logl(dbn, test_data, index, variable): node_name = variable + "_t_0" cpd = dbn.transition_bn().cpd(node_name) @@ -119,11 +140,11 @@ def transition_logl(dbn, test_data, index, variable): row_values = [test_data.loc[index, variable]] for e in evidence: - m = re.search('(.*)_t_(\\d+)', e) + m = re.search("(.*)_t_(\\d+)", e) e_var = m[1] t = int(m[2]) - row_values.append(test_data.loc[index-t, e_var]) + row_values.append(test_data.loc[index - t, e_var]) r = pd.Series(data=row_values, index=[node_name] + evidence) return lg_logl_row(r, node_name, evidence, cpd.beta, cpd.variance) @@ -142,11 +163,16 @@ def numpy_logl(dbn, test_data): return ll + def test_logl_dbn(): variables = ["a", "b", "c", "d"] - static_bn = GaussianNetwork(["a", "b", "c", "d"], [("a", "c"), ("b", "c"), ("c", "d")]) - static_bn = GaussianNetwork(["a", "b", "c", "d"], [("a", "c"), ("b", "c"), ("c", "d")]) + static_bn = GaussianNetwork( + ["a", "b", "c", "d"], [("a", "c"), ("b", "c"), ("c", "d")] + ) + static_bn = GaussianNetwork( + ["a", "b", "c", "d"], [("a", "c"), ("b", "c"), ("c", "d")] + ) gbn = DynamicGaussianNetwork(variables, 2) static_bn = gbn.static_bn() @@ -174,11 +200,16 @@ def test_logl_dbn(): ll = gbn.logl(test_df) assert np.all(np.isclose(ground_truth_ll, ll)) + def test_slogl_dbn(): variables = ["a", "b", "c", "d"] - static_bn = GaussianNetwork(["a", "b", "c", "d"], [("a", "c"), ("b", "c"), ("c", "d")]) - static_bn = 
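
The node sets asserted above encode the dynamic-network naming scheme: with markovian order m, the static network owns the lagged nodes v_t_1 ... v_t_m, and the transition network owns the current-slice nodes v_t_0, conditioned on the lagged ones. In brief, using only calls from this test:

    from pybnesian import DynamicGaussianNetwork

    dbn = DynamicGaussianNetwork(["a", "b"], 2)
    assert dbn.markovian_order() == 2
    assert set(dbn.static_bn().nodes()) == {"a_t_1", "a_t_2", "b_t_1", "b_t_2"}
    assert set(dbn.transition_bn().nodes()) == {"a_t_0", "b_t_0"}

    dbn.add_variable("e")    # a new variable appears in every time slice
    assert "e_t_0" in dbn.transition_bn().nodes()
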
GaussianNetwork(["a", "b", "c", "d"], [("a", "c"), ("b", "c"), ("c", "d")]) + static_bn = GaussianNetwork( + ["a", "b", "c", "d"], [("a", "c"), ("b", "c"), ("c", "d")] + ) + static_bn = GaussianNetwork( + ["a", "b", "c", "d"], [("a", "c"), ("b", "c"), ("c", "d")] + ) gbn = DynamicGaussianNetwork(variables, 2) static_bn = gbn.static_bn() @@ -202,4 +233,4 @@ def test_slogl_dbn(): gbn.fit(df) test_df = util_test.generate_normal_data(100) ll = numpy_logl(gbn, test_df) - assert np.isclose(gbn.slogl(test_df), ll.sum()) \ No newline at end of file + assert np.isclose(gbn.slogl(test_df), ll.sum()) diff --git a/tests/models/HeterogeneousBN_test.py b/tests/models/HeterogeneousBN_test.py index ca3614b7..227508f0 100644 --- a/tests/models/HeterogeneousBN_test.py +++ b/tests/models/HeterogeneousBN_test.py @@ -3,53 +3,71 @@ def test_type_equality(): - # + # # Test single vector types - # + # - het_single = pbn.HeterogeneousBN([pbn.CKDEType(), pbn.LinearGaussianCPDType()], ["a", "b", "c", "d"]) - het2_single = pbn.HeterogeneousBN([pbn.CKDEType(), pbn.LinearGaussianCPDType()], ["a", "b", "c", "d"]) + het_single = pbn.HeterogeneousBN( + [pbn.CKDEType(), pbn.LinearGaussianCPDType()], ["a", "b", "c", "d"] + ) + het2_single = pbn.HeterogeneousBN( + [pbn.CKDEType(), pbn.LinearGaussianCPDType()], ["a", "b", "c", "d"] + ) assert het_single.type() == het2_single.type() - het3_single = pbn.HeterogeneousBN([pbn.LinearGaussianCPDType(), pbn.CKDEType()], ["a", "b", "c", "d"]) - + het3_single = pbn.HeterogeneousBN( + [pbn.LinearGaussianCPDType(), pbn.CKDEType()], ["a", "b", "c", "d"] + ) + assert het_single.type() != het3_single.type() - # + # # Test a single vector type for each data type - # - - het_dt = pbn.HeterogeneousBN({ - pa.float64(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()], - pa.float32(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()], - pa.dictionary(pa.int8(), pa.string()): [pbn.DiscreteFactorType()] - }, ["a", "b", "c", "d"]) - - het2_dt = pbn.HeterogeneousBN({ - pa.dictionary(pa.int8(), pa.string()): [pbn.DiscreteFactorType()], - pa.float32(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()], - pa.float64(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()] - }, ["a", "b", "c", "d"]) - + # + + het_dt = pbn.HeterogeneousBN( + { + pa.float64(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()], + pa.float32(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()], + pa.dictionary(pa.int8(), pa.string()): [pbn.DiscreteFactorType()], + }, + ["a", "b", "c", "d"], + ) + + het2_dt = pbn.HeterogeneousBN( + { + pa.dictionary(pa.int8(), pa.string()): [pbn.DiscreteFactorType()], + pa.float32(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()], + pa.float64(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()], + }, + ["a", "b", "c", "d"], + ) + # The order of the set is not relevant assert het_dt.type() == het2_dt.type() - het3_dt = pbn.HeterogeneousBN({ - pa.dictionary(pa.int8(), pa.string()): [pbn.DiscreteFactorType()], - pa.float32(): [pbn.LinearGaussianCPDType(), pbn.CKDEType()], - pa.float64(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()] - }, ["a", "b", "c", "d"]) + het3_dt = pbn.HeterogeneousBN( + { + pa.dictionary(pa.int8(), pa.string()): [pbn.DiscreteFactorType()], + pa.float32(): [pbn.LinearGaussianCPDType(), pbn.CKDEType()], + pa.float64(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()], + }, + ["a", "b", "c", "d"], + ) # The order of the default FactorTypes is relevant assert het_dt.type() != het3_dt.type() - - # + + # # Compare single vector and multi vector FactorTypes - het_single = pbn.HeterogeneousBN([pbn.CKDEType(), 
pbn.LinearGaussianCPDType()], ["a", "b", "c", "d"]) - het_dt = pbn.HeterogeneousBN({ - pa.float64(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()] - }, ["a", "b", "c", "d"]) + het_single = pbn.HeterogeneousBN( + [pbn.CKDEType(), pbn.LinearGaussianCPDType()], ["a", "b", "c", "d"] + ) + het_dt = pbn.HeterogeneousBN( + {pa.float64(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()]}, + ["a", "b", "c", "d"], + ) assert het_single.type() != het_dt.type() diff --git a/tests/models/SemiparametricBN_test.py b/tests/models/SemiparametricBN_test.py index 8540b93f..91b634d5 100644 --- a/tests/models/SemiparametricBN_test.py +++ b/tests/models/SemiparametricBN_test.py @@ -6,112 +6,141 @@ df = util_test.generate_normal_data(10000) + def test_create_spbn(): - spbn = SemiparametricBN(['a', 'b', 'c', 'd']) + spbn = SemiparametricBN(["a", "b", "c", "d"]) assert spbn.num_nodes() == 4 assert spbn.num_arcs() == 0 - assert spbn.nodes() == ['a', 'b', 'c', 'd'] + assert spbn.nodes() == ["a", "b", "c", "d"] for n in spbn.nodes(): assert spbn.node_type(n) == pbn.UnknownFactorType() - spbn = SemiparametricBN(['a', 'b', 'c', 'd'], [('a', 'c')]) + spbn = SemiparametricBN(["a", "b", "c", "d"], [("a", "c")]) assert spbn.num_nodes() == 4 assert spbn.num_arcs() == 1 - assert spbn.nodes() == ['a', 'b', 'c', 'd'] + assert spbn.nodes() == ["a", "b", "c", "d"] for n in spbn.nodes(): assert spbn.node_type(n) == pbn.UnknownFactorType() - spbn = SemiparametricBN([('a', 'c'), ('b', 'd'), ('c', 'd')]) + spbn = SemiparametricBN([("a", "c"), ("b", "d"), ("c", "d")]) assert spbn.num_nodes() == 4 assert spbn.num_arcs() == 3 - assert spbn.nodes() == ['a', 'c', 'b', 'd'] + assert spbn.nodes() == ["a", "c", "b", "d"] for n in spbn.nodes(): assert spbn.node_type(n) == pbn.UnknownFactorType() with pytest.raises(TypeError) as ex: - spbn = SemiparametricBN(['a', 'b', 'c'], [('a', 'c', 'b')]) + spbn = SemiparametricBN(["a", "b", "c"], [("a", "c", "b")]) assert "incompatible constructor arguments" in str(ex.value) - + with pytest.raises(IndexError) as ex: - spbn = SemiparametricBN(['a', 'b', 'c'], [('a', 'd')]) + spbn = SemiparametricBN(["a", "b", "c"], [("a", "d")]) assert "not present in the graph" in str(ex.value) with pytest.raises(ValueError) as ex: - spbn = SemiparametricBN([('a', 'b'), ('b', 'c'), ('c', 'a')]) + spbn = SemiparametricBN([("a", "b"), ("b", "c"), ("c", "a")]) assert "must be a DAG" in str(ex.value) with pytest.raises(ValueError) as ex: - spbn = SemiparametricBN(['a', 'b', 'c', 'd'], [('a', 'b'), ('b', 'c'), ('c', 'a')]) + spbn = SemiparametricBN( + ["a", "b", "c", "d"], [("a", "b"), ("b", "c"), ("c", "a")] + ) assert "must be a DAG" in str(ex.value) + expected_node_type = { + "a": pbn.CKDEType(), + "b": pbn.UnknownFactorType(), + "c": pbn.CKDEType(), + "d": pbn.UnknownFactorType(), + } - expected_node_type = {'a': pbn.CKDEType(), - 'b': pbn.UnknownFactorType(), - 'c': pbn.CKDEType(), - 'd': pbn.UnknownFactorType()} - - spbn = SemiparametricBN(['a', 'b', 'c', 'd'], [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]) + spbn = SemiparametricBN( + ["a", "b", "c", "d"], [("a", pbn.CKDEType()), ("c", pbn.CKDEType())] + ) assert spbn.num_nodes() == 4 assert spbn.num_arcs() == 0 - assert spbn.nodes() == ['a', 'b', 'c', 'd'] + assert spbn.nodes() == ["a", "b", "c", "d"] for n in spbn.nodes(): assert spbn.node_type(n) == expected_node_type[n] - spbn = SemiparametricBN(['a', 'b', 'c', 'd'], [('a', 'c')], [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]) + spbn = SemiparametricBN( + ["a", "b", "c", "d"], + [("a", "c")], + [("a", 
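
The comparisons above define HeterogeneousBN type equality: the default factor types may be one list shared by all data types or a dict keyed by Arrow data type; dict ordering is irrelevant, but the order inside each FactorType list is part of the type, and list-based and dict-based defaults never compare equal. Restated compactly (the two node names are illustrative):

    import pyarrow as pa
    import pybnesian as pbn

    nodes = ["a", "b"]
    h1 = pbn.HeterogeneousBN([pbn.CKDEType(), pbn.LinearGaussianCPDType()], nodes)
    h2 = pbn.HeterogeneousBN([pbn.LinearGaussianCPDType(), pbn.CKDEType()], nodes)
    assert h1.type() != h2.type()   # list order matters

    d1 = pbn.HeterogeneousBN({pa.float64(): [pbn.CKDEType()]}, nodes)
    d2 = pbn.HeterogeneousBN({pa.float64(): [pbn.CKDEType()]}, nodes)
    assert d1.type() == d2.type()
    assert h1.type() != d1.type()   # single-list vs per-data-type defaults
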
pbn.CKDEType()), ("c", pbn.CKDEType())], + ) assert spbn.num_nodes() == 4 assert spbn.num_arcs() == 1 - assert spbn.nodes() == ['a', 'b', 'c', 'd'] + assert spbn.nodes() == ["a", "b", "c", "d"] for n in spbn.nodes(): assert spbn.node_type(n) == expected_node_type[n] - spbn = SemiparametricBN([('a', 'c'), ('b', 'd'), ('c', 'd')], [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]) + spbn = SemiparametricBN( + [("a", "c"), ("b", "d"), ("c", "d")], + [("a", pbn.CKDEType()), ("c", pbn.CKDEType())], + ) assert spbn.num_nodes() == 4 assert spbn.num_arcs() == 3 - assert spbn.nodes() == ['a', 'c', 'b', 'd'] + assert spbn.nodes() == ["a", "c", "b", "d"] for n in spbn.nodes(): assert spbn.node_type(n) == expected_node_type[n] with pytest.raises(TypeError) as ex: - spbn = SemiparametricBN(['a', 'b', 'c'], [('a', 'c', 'b')], [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]) + spbn = SemiparametricBN( + ["a", "b", "c"], + [("a", "c", "b")], + [("a", pbn.CKDEType()), ("c", pbn.CKDEType())], + ) assert "incompatible constructor arguments" in str(ex.value) - + with pytest.raises(IndexError) as ex: - spbn = SemiparametricBN(['a', 'b', 'c'], [('a', 'd')], [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]) + spbn = SemiparametricBN( + ["a", "b", "c"], + [("a", "d")], + [("a", pbn.CKDEType()), ("c", pbn.CKDEType())], + ) assert "not present in the graph" in str(ex.value) with pytest.raises(ValueError) as ex: - spbn = SemiparametricBN([('a', 'b'), ('b', 'c'), ('c', 'a')], [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]) + spbn = SemiparametricBN( + [("a", "b"), ("b", "c"), ("c", "a")], + [("a", pbn.CKDEType()), ("c", pbn.CKDEType())], + ) assert "must be a DAG" in str(ex.value) with pytest.raises(ValueError) as ex: - spbn = SemiparametricBN(['a', 'b', 'c', 'd'], - [('a', 'b'), ('b', 'c'), ('c', 'a')], - [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]) + spbn = SemiparametricBN( + ["a", "b", "c", "d"], + [("a", "b"), ("b", "c"), ("c", "a")], + [("a", pbn.CKDEType()), ("c", pbn.CKDEType())], + ) assert "must be a DAG" in str(ex.value) def test_node_type(): - spbn = SemiparametricBN(['a', 'b', 'c', 'd']) + spbn = SemiparametricBN(["a", "b", "c", "d"]) assert spbn.num_nodes() == 4 assert spbn.num_arcs() == 0 - assert spbn.nodes() == ['a', 'b', 'c', 'd'] + assert spbn.nodes() == ["a", "b", "c", "d"] for n in spbn.nodes(): assert spbn.node_type(n) == pbn.UnknownFactorType() - - spbn.set_node_type('b', pbn.CKDEType()) - assert spbn.node_type('b') == pbn.CKDEType() - spbn.set_node_type('b', pbn.LinearGaussianCPDType()) - assert spbn.node_type('b') == pbn.LinearGaussianCPDType() + + spbn.set_node_type("b", pbn.CKDEType()) + assert spbn.node_type("b") == pbn.CKDEType() + spbn.set_node_type("b", pbn.LinearGaussianCPDType()) + assert spbn.node_type("b") == pbn.LinearGaussianCPDType() + def test_fit(): - spbn = SemiparametricBN([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) + spbn = SemiparametricBN( + [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")] + ) with pytest.raises(ValueError) as ex: for n in spbn.nodes(): @@ -129,81 +158,97 @@ def test_fit(): assert set(cpd.evidence()) == set(spbn.parents(n)) spbn.fit(df) - - spbn.remove_arc('a', 'b') - cpd_b = spbn.cpd('b') + spbn.remove_arc("a", "b") + + cpd_b = spbn.cpd("b") assert type(cpd_b) == pbn.LinearGaussianCPD - assert cpd_b.evidence != spbn.parents('b') + assert cpd_b.evidence != spbn.parents("b") spbn.fit(df) - cpd_b = spbn.cpd('b') + cpd_b = spbn.cpd("b") assert type(cpd_b) == pbn.LinearGaussianCPD - assert cpd_b.evidence() == 
spbn.parents('b') + assert cpd_b.evidence() == spbn.parents("b") - spbn.set_node_type('c', pbn.CKDEType()) + spbn.set_node_type("c", pbn.CKDEType()) with pytest.raises(ValueError) as ex: - cpd_c = spbn.cpd('c') + cpd_c = spbn.cpd("c") assert "not added" in str(ex.value) spbn.fit(df) - cpd_c = spbn.cpd('c') - assert cpd_c.type() == spbn.node_type('c') + cpd_c = spbn.cpd("c") + assert cpd_c.type() == spbn.node_type("c") def test_cpd(): - spbn = SemiparametricBN([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')], [('d', pbn.CKDEType())]) + spbn = SemiparametricBN( + [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")], + [("d", pbn.CKDEType())], + ) with pytest.raises(ValueError) as ex: - spbn.cpd('a') + spbn.cpd("a") assert "not added" in str(ex.value) spbn.fit(df) - assert spbn.cpd('a').type() == pbn.LinearGaussianCPDType() - assert spbn.cpd('b').type() == pbn.LinearGaussianCPDType() - assert spbn.cpd('c').type() == pbn.LinearGaussianCPDType() - assert spbn.cpd('d').type() == pbn.CKDEType() + assert spbn.cpd("a").type() == pbn.LinearGaussianCPDType() + assert spbn.cpd("b").type() == pbn.LinearGaussianCPDType() + assert spbn.cpd("c").type() == pbn.LinearGaussianCPDType() + assert spbn.cpd("d").type() == pbn.CKDEType() + + assert spbn.cpd("a").fitted() + assert spbn.cpd("b").fitted() + assert spbn.cpd("c").fitted() + assert spbn.cpd("d").fitted() - assert spbn.cpd('a').fitted() - assert spbn.cpd('b').fitted() - assert spbn.cpd('c').fitted() - assert spbn.cpd('d').fitted() def test_add_cpds(): - spbn = SemiparametricBN([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')], [('d', pbn.CKDEType())]) + spbn = SemiparametricBN( + [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")], + [("d", pbn.CKDEType())], + ) - assert spbn.node_type('a') == pbn.UnknownFactorType() - spbn.add_cpds([CKDE('a', [])]) - assert spbn.node_type('a') == pbn.CKDEType() + assert spbn.node_type("a") == pbn.UnknownFactorType() + spbn.add_cpds([CKDE("a", [])]) + assert spbn.node_type("a") == pbn.CKDEType() with pytest.raises(ValueError) as ex: - spbn.add_cpds([LinearGaussianCPD('d', ['a', 'b', 'c'])]) + spbn.add_cpds([LinearGaussianCPD("d", ["a", "b", "c"])]) assert "Bayesian network expects type" in str(ex.value) - lg = LinearGaussianCPD('b', ['a'], [2.5, 1.65], 4) - ckde = CKDE('d', ['a', 'b', 'c']) + lg = LinearGaussianCPD("b", ["a"], [2.5, 1.65], 4) + ckde = CKDE("d", ["a", "b", "c"]) assert lg.fitted() assert not ckde.fitted() spbn.add_cpds([lg, ckde]) - spbn.set_node_type('a', pbn.UnknownFactorType()) + spbn.set_node_type("a", pbn.UnknownFactorType()) with pytest.raises(ValueError) as ex: - not spbn.cpd('a').fitted() - assert "CPD of variable \"a\" not added. Call add_cpds() or fit() to add the CPD." in str(ex.value) + not spbn.cpd("a").fitted() + assert ( + 'CPD of variable "a" not added. Call add_cpds() or fit() to add the CPD.' + in str(ex.value) + ) - assert spbn.cpd('b').fitted() + assert spbn.cpd("b").fitted() with pytest.raises(ValueError) as ex: - not spbn.cpd('c').fitted() - assert "CPD of variable \"c\" not added. Call add_cpds() or fit() to add the CPD." in str(ex.value) + not spbn.cpd("c").fitted() + assert ( + 'CPD of variable "c" not added. Call add_cpds() or fit() to add the CPD.' 
+ in str(ex.value) + ) + + assert not spbn.cpd("d").fitted() - assert not spbn.cpd('d').fitted() def test_logl(): - spbn = SemiparametricBN([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) + spbn = SemiparametricBN( + [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")] + ) spbn.fit(df) @@ -213,7 +258,7 @@ def test_logl(): sum_ll = np.zeros((5000,)) sum_sll = 0 - + for n in spbn.nodes(): cpd = spbn.cpd(n) l = cpd.logl(test_df) @@ -221,7 +266,7 @@ def test_logl(): assert np.all(np.isclose(s, l.sum())) sum_ll += l sum_sll += s - + assert np.all(np.isclose(ll, sum_ll)) assert np.isclose(sll, ll.sum()) - assert sll == sum_sll \ No newline at end of file + assert sll == sum_sll diff --git a/tests/serialization/serialize_factor_test.py b/tests/serialization/serialize_factor_test.py index f6e34256..05fa9405 100644 --- a/tests/serialization/serialize_factor_test.py +++ b/tests/serialization/serialize_factor_test.py @@ -5,21 +5,25 @@ from pybnesian import FactorType, Factor, LinearGaussianCPD, CKDE, DiscreteFactor import pickle + @pytest.fixture def lg_bytes(): lg = LinearGaussianCPD("c", ["a", "b"]) return pickle.dumps(lg) + @pytest.fixture def ckde_bytes(): ckde = CKDE("c", ["a", "b"]) return pickle.dumps(ckde) + @pytest.fixture def discrete_bytes(): discrete = DiscreteFactor("c", ["a", "b"]) return pickle.dumps(discrete) + class NewType(FactorType): def __init__(self, factor_class): FactorType.__init__(self) @@ -31,16 +35,17 @@ def new_factor(self, model, variable, evidence): def __str__(self): return "NewType" + class NewFactor(Factor): def __init__(self, variable, evidence): Factor.__init__(self, variable, evidence) self._fitted = False self.some_fit_data = None - + def fit(self, df): self.some_fit_data = "fitted" self._fitted = True - + def fitted(self): return self._fitted @@ -51,12 +56,13 @@ def __str__(self): return "NewFactor" def __getstate_extra__(self): - d = {'fitted': self._fitted, 'some_fit_data': self.some_fit_data} + d = {"fitted": self._fitted, "some_fit_data": self.some_fit_data} return d def __setstate_extra__(self, d): - self._fitted = d['fitted'] - self.some_fit_data = d['some_fit_data'] + self._fitted = d["fitted"] + self.some_fit_data = d["some_fit_data"] + class NewFactorBis(Factor): def __init__(self, variable, evidence): @@ -67,7 +73,7 @@ def __init__(self, variable, evidence): def fit(self, df): self.some_fit_data = "fitted" self._fitted = True - + def fitted(self): return self._fitted @@ -78,28 +84,35 @@ def __str__(self): return "NewFactor" def __getstate__(self): - d = {'variable': self.variable(), - 'evidence': self.evidence(), - 'fitted': self._fitted, - 'some_fit_data': self.some_fit_data} + d = { + "variable": self.variable(), + "evidence": self.evidence(), + "fitted": self._fitted, + "some_fit_data": self.some_fit_data, + } return d def __setstate__(self, d): - Factor.__init__(self, d['variable'], d['evidence']) - self._fitted = d['fitted'] - self.some_fit_data = d['some_fit_data'] + Factor.__init__(self, d["variable"], d["evidence"]) + self._fitted = d["fitted"] + self.some_fit_data = d["some_fit_data"] + @pytest.fixture def new_bytes(): n = NewFactor("c", ["a", "b"]) return pickle.dumps(n) + @pytest.fixture def newbis_bytes(): n = NewFactorBis("c", ["a", "b"]) return pickle.dumps(n) -def test_serialization_unfitted_factor(lg_bytes, ckde_bytes, discrete_bytes, new_bytes, newbis_bytes): + +def test_serialization_unfitted_factor( + lg_bytes, ckde_bytes, discrete_bytes, new_bytes, newbis_bytes +): loaded_lg = 
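
NewFactor above uses the lighter of the two pickling protocols available to custom factors: PyBNesian serializes the variable and evidence itself and calls __getstate_extra__()/__setstate_extra__() for any remaining state, while NewFactorBis shows the full __getstate__/__setstate__ route, where the subclass must also re-init the base class. A trimmed version of the first pattern (MyType and MyFactor are illustrative names):

    import pickle
    from pybnesian import Factor, FactorType

    class MyType(FactorType):
        def __init__(self):
            FactorType.__init__(self)

        def new_factor(self, model, variable, evidence):
            return MyFactor(variable, evidence)

        def __str__(self):
            return "MyType"

    class MyFactor(Factor):
        def __init__(self, variable, evidence):
            Factor.__init__(self, variable, evidence)
            self._fitted = False

        def fit(self, df):
            self._fitted = True

        def fitted(self):
            return self._fitted

        def type(self):
            return MyType()

        def __str__(self):
            return "MyFactor"

        def __getstate_extra__(self):
            return {"fitted": self._fitted}

        def __setstate_extra__(self, d):
            self._fitted = d["fitted"]

    f = MyFactor("c", ["a", "b"])
    f.fit(None)
    g = pickle.loads(pickle.dumps(f))
    assert g.variable() == "c" and g.fitted()
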
pickle.loads(lg_bytes) assert loaded_lg.variable() == "c" assert set(loaded_lg.evidence()) == set(["a", "b"]) @@ -127,6 +140,7 @@ def test_serialization_unfitted_factor(lg_bytes, ckde_bytes, discrete_bytes, new assert loaded_new.type() == nn.type() from pybnesian import GaussianNetwork + dummy_network = GaussianNetwork(["a", "b", "c", "d"]) assert type(loaded_new.type().new_factor(dummy_network, "a", [])) == NewFactor @@ -147,43 +161,61 @@ def test_serialization_unfitted_factor(lg_bytes, ckde_bytes, discrete_bytes, new assert loaded_discrete.type() != loaded_new.type() assert loaded_newbis.type() == loaded_new.type() + @pytest.fixture def lg_fitted_bytes(): lg = LinearGaussianCPD("c", ["a", "b"], [1, 2, 3], 0.5) return pickle.dumps(lg) + @pytest.fixture def ckde_fitted_bytes(): np.random.seed(1) - data = pd.DataFrame({'a': np.random.rand(10), 'b': np.random.rand(10), 'c': np.random.rand(10)}).astype(float) + data = pd.DataFrame( + {"a": np.random.rand(10), "b": np.random.rand(10), "c": np.random.rand(10)} + ).astype(float) ckde = CKDE("c", ["a", "b"]) ckde.fit(data) return pickle.dumps(ckde) + @pytest.fixture def discrete_fitted_bytes(): discrete = DiscreteFactor("c", ["a", "b"]) - data = pd.DataFrame({'a': ["a1", "a2", "a1", "a2", "a2", "a2", "a2", "a2"], - 'b': ["b1", "b1", "b1", "b1", "b1", "b2", "b1", "b2"], - 'c': ["c1", "c1", "c1", "c1", "c2", "c2", "c2", "c2"]}, dtype="category") + data = pd.DataFrame( + { + "a": ["a1", "a2", "a1", "a2", "a2", "a2", "a2", "a2"], + "b": ["b1", "b1", "b1", "b1", "b1", "b2", "b1", "b2"], + "c": ["c1", "c1", "c1", "c1", "c2", "c2", "c2", "c2"], + }, + dtype="category", + ) discrete.fit(data) return pickle.dumps(discrete) + @pytest.fixture def new_fitted_bytes(): n = NewFactor("c", ["a", "b"]) n.fit(None) return pickle.dumps(n) + @pytest.fixture def newbis_fitted_bytes(): n = NewFactorBis("c", ["a", "b"]) n.fit(None) return pickle.dumps(n) -def test_serialization_fitted_factor(lg_fitted_bytes, ckde_fitted_bytes, discrete_fitted_bytes, new_fitted_bytes, - newbis_fitted_bytes): + +def test_serialization_fitted_factor( + lg_fitted_bytes, + ckde_fitted_bytes, + discrete_fitted_bytes, + new_fitted_bytes, + newbis_fitted_bytes, +): loaded_lg = pickle.loads(lg_fitted_bytes) assert loaded_lg.variable() == "c" assert set(loaded_lg.evidence()) == set(["a", "b"]) @@ -199,9 +231,9 @@ def test_serialization_fitted_factor(lg_fitted_bytes, ckde_fitted_bytes, discret assert loaded_ckde.num_instances() == 10 tr = loaded_ckde.kde_joint().dataset().to_pandas() np.random.seed(1) - assert np.all(tr['a'] == np.random.rand(10)) - assert np.all(tr['b'] == np.random.rand(10)) - assert np.all(tr['c'] == np.random.rand(10)) + assert np.all(tr["a"] == np.random.rand(10)) + assert np.all(tr["b"] == np.random.rand(10)) + assert np.all(tr["c"] == np.random.rand(10)) loaded_discrete = pickle.loads(discrete_fitted_bytes) assert loaded_discrete.variable() == "c" @@ -209,9 +241,14 @@ def test_serialization_fitted_factor(lg_fitted_bytes, ckde_fitted_bytes, discret assert loaded_discrete.fitted() assert loaded_discrete.type() == pbn.DiscreteFactorType() - test = pd.DataFrame({'a': ["a1", "a2", "a1", "a2", "a1", "a2", "a1", "a2"], - 'b': ["b1", "b1", "b2", "b2", "b1", "b1", "b2", "b2"], - 'c': ["c1", "c1", "c1", "c1", "c2", "c2", "c2", "c2"]}, dtype="category") + test = pd.DataFrame( + { + "a": ["a1", "a2", "a1", "a2", "a1", "a2", "a1", "a2"], + "b": ["b1", "b1", "b2", "b2", "b1", "b1", "b2", "b2"], + "c": ["c1", "c1", "c1", "c1", "c2", "c2", "c2", "c2"], + }, + dtype="category", + ) ll = 
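
The ckde_fitted_bytes fixture above also documents the CKDE life cycle: construct with a variable and its evidence, fit on a DataFrame, and the fitted factor keeps its training sample reachable through kde_joint().dataset(). A sketch with made-up data (50 rows; the seed is arbitrary):

    import numpy as np
    import pandas as pd
    from pybnesian import CKDE

    rng = np.random.default_rng(1)
    data = pd.DataFrame({"a": rng.random(50), "b": rng.random(50),
                         "c": rng.random(50)})

    ckde = CKDE("c", ["a", "b"])
    assert not ckde.fitted()
    ckde.fit(data)
    assert ckde.fitted()
    assert ckde.num_instances() == 50

    train = ckde.kde_joint().dataset().to_pandas()  # the stored training sample
    assert np.allclose(train["c"], data["c"])
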
loaded_discrete.logl(test) assert list(np.exp(ll)) == [1, 0.5, 0.5, 0, 0, 0.5, 0.5, 1] @@ -239,4 +276,4 @@ def test_serialization_fitted_factor(lg_fitted_bytes, ckde_fitted_bytes, discret assert loaded_lg.type() != loaded_new.type() assert loaded_ckde.type() != loaded_discrete.type() assert loaded_ckde.type() != loaded_new.type() - assert loaded_discrete.type() != loaded_new.type() \ No newline at end of file + assert loaded_discrete.type() != loaded_new.type() diff --git a/tests/serialization/serialize_factor_type_test.py b/tests/serialization/serialize_factor_type_test.py index 67f9480c..d32edd3f 100644 --- a/tests/serialization/serialize_factor_type_test.py +++ b/tests/serialization/serialize_factor_type_test.py @@ -3,40 +3,54 @@ from pybnesian import FactorType import pickle + @pytest.fixture def lg_type_bytes(): lg = pbn.LinearGaussianCPDType() return pickle.dumps(lg) + @pytest.fixture def ckde_type_bytes(): ckde = pbn.CKDEType() return pickle.dumps(ckde) + @pytest.fixture def discrete_type_bytes(): discrete = pbn.DiscreteFactorType() return pickle.dumps(discrete) + class NewType(FactorType): def __init__(self): FactorType.__init__(self) + class OtherType(FactorType): def __init__(self): FactorType.__init__(self) + @pytest.fixture def new_type_bytes(): n = NewType() return pickle.dumps(n) + @pytest.fixture def other_type_bytes(): o = OtherType() return pickle.dumps(o) -def test_serialization_factor_type(lg_type_bytes, ckde_type_bytes, discrete_type_bytes, new_type_bytes, other_type_bytes): + +def test_serialization_factor_type( + lg_type_bytes, + ckde_type_bytes, + discrete_type_bytes, + new_type_bytes, + other_type_bytes, +): loaded_lg = pickle.loads(lg_type_bytes) new_lg = pbn.LinearGaussianCPDType() assert new_lg == loaded_lg @@ -66,4 +80,4 @@ def test_serialization_factor_type(lg_type_bytes, ckde_type_bytes, discrete_type assert new_ckde != new_other assert new_discrete != new_new assert new_discrete != new_other - assert new_new != new_other \ No newline at end of file + assert new_new != new_other diff --git a/tests/serialization/serialize_models_test.py b/tests/serialization/serialize_models_test.py index a70e2cb3..f2ae24f8 100644 --- a/tests/serialization/serialize_models_test.py +++ b/tests/serialization/serialize_models_test.py @@ -1,31 +1,46 @@ import pytest import pyarrow as pa import pybnesian as pbn -from pybnesian import BayesianNetworkType, BayesianNetwork, ConditionalBayesianNetwork, GaussianNetwork,\ - SemiparametricBN, KDENetwork, DiscreteBN, LinearGaussianCPD, CKDE, DiscreteFactor +from pybnesian import ( + BayesianNetworkType, + BayesianNetwork, + ConditionalBayesianNetwork, + GaussianNetwork, + SemiparametricBN, + KDENetwork, + DiscreteBN, + LinearGaussianCPD, + CKDE, + DiscreteFactor, +) import pickle import util_test + @pytest.fixture def gaussian_bytes(): gaussian = GaussianNetwork(["a", "b", "c", "d"], [("a", "b")]) return pickle.dumps(gaussian) + @pytest.fixture def spbn_bytes(): spbn = SemiparametricBN(["a", "b", "c", "d"], [("a", "b")], [("b", pbn.CKDEType())]) return pickle.dumps(spbn) + @pytest.fixture def kde_bytes(): kde = KDENetwork(["a", "b", "c", "d"], [("a", "b")]) return pickle.dumps(kde) + @pytest.fixture def discrete_bytes(): discrete = DiscreteBN(["a", "b", "c", "d"], [("a", "b")]) return pickle.dumps(discrete) + class MyRestrictedGaussianNetworkType(BayesianNetworkType): def __init__(self): BayesianNetworkType.__init__(self) @@ -48,27 +63,35 @@ def new_cbn(self, nodes, interface_nodes): def __str__(self): return 
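
The assertion list(np.exp(ll)) == [1, 0.5, ...] above works because a fitted DiscreteFactor stores maximum-likelihood conditional probabilities, so exp(logl(row)) is the estimated P(variable | evidence) for that row. The same computation on a smaller, made-up table:

    import numpy as np
    import pandas as pd
    from pybnesian import DiscreteFactor

    train = pd.DataFrame({"a": ["a1", "a1", "a2", "a2"],
                          "c": ["c1", "c1", "c1", "c2"]}, dtype="category")
    f = DiscreteFactor("c", ["a"])
    f.fit(train)

    test = pd.DataFrame({"a": ["a1", "a2"],
                         "c": ["c1", "c1"]}, dtype="category")
    probs = np.exp(f.logl(test))
    assert np.allclose(probs, [1.0, 0.5])   # P(c1|a1) = 2/2, P(c1|a2) = 1/2
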
"MyRestrictedGaussianNetworkType" + @pytest.fixture def genericbn_bytes(): - gen = BayesianNetwork(MyRestrictedGaussianNetworkType(), ["a", "b", "c", "d"], [("a", "b")]) + gen = BayesianNetwork( + MyRestrictedGaussianNetworkType(), ["a", "b", "c", "d"], [("a", "b")] + ) return pickle.dumps(gen) + class NewBN(BayesianNetwork): def __init__(self, variables, arcs=None): if arcs is None: BayesianNetwork.__init__(self, MyRestrictedGaussianNetworkType(), variables) else: - BayesianNetwork.__init__(self, MyRestrictedGaussianNetworkType(), variables, arcs) + BayesianNetwork.__init__( + self, MyRestrictedGaussianNetworkType(), variables, arcs + ) + @pytest.fixture def newbn_bytes(): new = NewBN(["a", "b", "c", "d"], [("a", "b")]) return pickle.dumps(new) + class NonHomogeneousType(BayesianNetworkType): def __init__(self): BayesianNetworkType.__init__(self) - + def is_homogeneous(self): return False @@ -94,12 +117,16 @@ def __init__(self, variables, arcs=None, node_types=None): if node_types is None: BayesianNetwork.__init__(self, NonHomogeneousType(), variables) else: - BayesianNetwork.__init__(self, NonHomogeneousType(), variables, node_types) + BayesianNetwork.__init__( + self, NonHomogeneousType(), variables, node_types + ) else: if node_types is None: BayesianNetwork.__init__(self, NonHomogeneousType(), variables, arcs) else: - BayesianNetwork.__init__(self, NonHomogeneousType(), variables, arcs, node_types) + BayesianNetwork.__init__( + self, NonHomogeneousType(), variables, arcs, node_types + ) self.extra_info = "extra" @@ -109,14 +136,30 @@ def __getstate_extra__(self): def __setstate_extra__(self, t): self.extra_info = t + @pytest.fixture def otherbn_bytes(): - other = OtherBN(["a", "b", "c", "d"], [("a", "b")], [("b", pbn.LinearGaussianCPDType()), - ("c", pbn.CKDEType()), - ("d", pbn.DiscreteFactorType())]) + other = OtherBN( + ["a", "b", "c", "d"], + [("a", "b")], + [ + ("b", pbn.LinearGaussianCPDType()), + ("c", pbn.CKDEType()), + ("d", pbn.DiscreteFactorType()), + ], + ) return pickle.dumps(other) -def test_serialization_bn_model(gaussian_bytes, spbn_bytes, kde_bytes, discrete_bytes, genericbn_bytes, newbn_bytes, otherbn_bytes): + +def test_serialization_bn_model( + gaussian_bytes, + spbn_bytes, + kde_bytes, + discrete_bytes, + genericbn_bytes, + newbn_bytes, + otherbn_bytes, +): loaded_g = pickle.loads(gaussian_bytes) assert set(loaded_g.nodes()) == set(["a", "b", "c", "d"]) assert loaded_g.arcs() == [("a", "b")] @@ -126,10 +169,12 @@ def test_serialization_bn_model(gaussian_bytes, spbn_bytes, kde_bytes, discrete_ assert set(loaded_s.nodes()) == set(["a", "b", "c", "d"]) assert loaded_s.arcs() == [("a", "b")] assert loaded_s.type() == pbn.SemiparametricBNType() - assert loaded_s.node_types() == {'a': pbn.UnknownFactorType(), - 'b': pbn.CKDEType(), - 'c': pbn.UnknownFactorType(), - 'd': pbn.UnknownFactorType()} + assert loaded_s.node_types() == { + "a": pbn.UnknownFactorType(), + "b": pbn.CKDEType(), + "c": pbn.UnknownFactorType(), + "d": pbn.UnknownFactorType(), + } loaded_k = pickle.loads(kde_bytes) assert set(loaded_k.nodes()) == set(["a", "b", "c", "d"]) @@ -155,14 +200,17 @@ def test_serialization_bn_model(gaussian_bytes, spbn_bytes, kde_bytes, discrete_ assert set(loaded_g.nodes()) == set(["a", "b", "c", "d"]) assert loaded_o.arcs() == [("a", "b")] assert loaded_o.type() == NonHomogeneousType() - assert loaded_o.node_types() == {'a': pbn.UnknownFactorType(), - 'b': pbn.LinearGaussianCPDType(), - 'c': pbn.CKDEType(), - 'd': pbn.DiscreteFactorType()} + assert loaded_o.node_types() 
== { + "a": pbn.UnknownFactorType(), + "b": pbn.LinearGaussianCPDType(), + "c": pbn.CKDEType(), + "d": pbn.DiscreteFactorType(), + } assert loaded_o.extra_info == "extra" assert loaded_nn.type() != loaded_o.type() + @pytest.fixture def gaussian_partial_fit_bytes(): gaussian = GaussianNetwork(["a", "b", "c", "d"], [("a", "b")]) @@ -171,6 +219,7 @@ def gaussian_partial_fit_bytes(): gaussian.include_cpd = True return pickle.dumps(gaussian) + @pytest.fixture def gaussian_fit_bytes(): gaussian = GaussianNetwork(["a", "b", "c", "d"], [("a", "b")]) @@ -182,24 +231,38 @@ def gaussian_fit_bytes(): gaussian.include_cpd = True return pickle.dumps(gaussian) + @pytest.fixture def other_partial_fit_bytes(): - other = OtherBN(["a", "b", "c", "d"], [("a", "b")], [("b", pbn.LinearGaussianCPDType()), - ("c", pbn.CKDEType()), - ("d", pbn.DiscreteFactorType())]) + other = OtherBN( + ["a", "b", "c", "d"], + [("a", "b")], + [ + ("b", pbn.LinearGaussianCPDType()), + ("c", pbn.CKDEType()), + ("d", pbn.DiscreteFactorType()), + ], + ) lg = LinearGaussianCPD("b", ["a"], [1, 2], 2) other.add_cpds([lg]) other.include_cpd = True return pickle.dumps(other) + @pytest.fixture def other_fit_bytes(): - other = OtherBN(["a", "b", "c", "d"], [("a", "b")], [("b", pbn.LinearGaussianCPDType()), - ("c", pbn.CKDEType()), - ("d", pbn.DiscreteFactorType())]) + other = OtherBN( + ["a", "b", "c", "d"], + [("a", "b")], + [ + ("b", pbn.LinearGaussianCPDType()), + ("c", pbn.CKDEType()), + ("d", pbn.DiscreteFactorType()), + ], + ) cpd_a = LinearGaussianCPD("a", [], [0], 0.5) cpd_b = LinearGaussianCPD("b", ["a"], [1, 2], 2) - + df_continuous = util_test.generate_normal_data_indep(100) cpd_c = CKDE("c", []) cpd_c.fit(df_continuous) @@ -208,13 +271,19 @@ def other_fit_bytes(): df_discrete.columns = df_discrete.columns.str.lower() cpd_d = DiscreteFactor("d", []) cpd_d.fit(df_discrete) - + other.add_cpds([cpd_a, cpd_b, cpd_c, cpd_d]) other.include_cpd = True return pickle.dumps(other) -def test_serialization_fitted_bn(gaussian_partial_fit_bytes, gaussian_fit_bytes, other_partial_fit_bytes, other_fit_bytes): + +def test_serialization_fitted_bn( + gaussian_partial_fit_bytes, + gaussian_fit_bytes, + other_partial_fit_bytes, + other_fit_bytes, +): # #################### # Gaussian partial fit # #################### @@ -249,7 +318,7 @@ def test_serialization_fitted_bn(gaussian_partial_fit_bytes, gaussian_fit_bytes, assert cpd_c.evidence() == [] assert cpd_c.beta == [2] assert cpd_c.variance == 1 - + cpd_d = loaded_fitted.cpd("d") assert cpd_d.variable() == "d" assert cpd_d.evidence() == [] @@ -305,55 +374,79 @@ def test_serialization_fitted_bn(gaussian_partial_fit_bytes, gaussian_fit_bytes, # Conditional BN # ########################## + @pytest.fixture def cond_gaussian_bytes(): gaussian = pbn.ConditionalGaussianNetwork(["c", "d"], ["a", "b"], [("a", "c")]) return pickle.dumps(gaussian) + @pytest.fixture def cond_spbn_bytes(): - spbn = pbn.ConditionalSemiparametricBN(["c", "d"], ["a", "b"], [("a", "c")], [("c", pbn.CKDEType())]) + spbn = pbn.ConditionalSemiparametricBN( + ["c", "d"], ["a", "b"], [("a", "c")], [("c", pbn.CKDEType())] + ) return pickle.dumps(spbn) + @pytest.fixture def cond_kde_bytes(): kde = pbn.ConditionalKDENetwork(["c", "d"], ["a", "b"], [("a", "c")]) return pickle.dumps(kde) + @pytest.fixture def cond_discrete_bytes(): discrete = pbn.ConditionalDiscreteBN(["c", "d"], ["a", "b"], [("a", "c")]) return pickle.dumps(discrete) + @pytest.fixture def cond_genericbn_bytes(): - gen = 
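
The include_cpd attribute set in these fixtures is the switch the serialization tests revolve around: pickling a network normally stores structure and types, and, as the fixtures suggest, setting model.include_cpd = True before pickle.dumps() embeds the added CPDs as well. Minimal form (node names and parameters are illustrative):

    import pickle
    from pybnesian import GaussianNetwork, LinearGaussianCPD

    g = GaussianNetwork(["a", "b"], [("a", "b")])
    g.add_cpds([LinearGaussianCPD("b", ["a"], [1, 2], 2)])

    g.include_cpd = True                      # also serialize the CPDs
    loaded = pickle.loads(pickle.dumps(g))
    assert list(loaded.cpd("b").beta) == [1, 2]
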
ConditionalBayesianNetwork(MyRestrictedGaussianNetworkType(), ["c", "d"], ["a", "b"], [("a", "c")]) + gen = ConditionalBayesianNetwork( + MyRestrictedGaussianNetworkType(), ["c", "d"], ["a", "b"], [("a", "c")] + ) return pickle.dumps(gen) + class ConditionalNewBN(ConditionalBayesianNetwork): def __init__(self, variables, interface, arcs=None): if arcs is None: - ConditionalBayesianNetwork.__init__(self, MyRestrictedGaussianNetworkType(), variables, interface) + ConditionalBayesianNetwork.__init__( + self, MyRestrictedGaussianNetworkType(), variables, interface + ) else: - ConditionalBayesianNetwork.__init__(self, MyRestrictedGaussianNetworkType(), variables, interface, arcs) + ConditionalBayesianNetwork.__init__( + self, MyRestrictedGaussianNetworkType(), variables, interface, arcs + ) + @pytest.fixture def cond_newbn_bytes(): new = ConditionalNewBN(["c", "d"], ["a", "b"], [("a", "c")]) return pickle.dumps(new) + class ConditionalOtherBN(ConditionalBayesianNetwork): def __init__(self, variables, interface, arcs=None, node_types=None): if arcs is None: if node_types is None: - ConditionalBayesianNetwork.__init__(self, NonHomogeneousType(), variables, interface) + ConditionalBayesianNetwork.__init__( + self, NonHomogeneousType(), variables, interface + ) else: - ConditionalBayesianNetwork.__init__(self, NonHomogeneousType(), variables, interface, node_types) + ConditionalBayesianNetwork.__init__( + self, NonHomogeneousType(), variables, interface, node_types + ) else: if node_types is None: - ConditionalBayesianNetwork.__init__(self, NonHomogeneousType(), variables, interface, arcs) + ConditionalBayesianNetwork.__init__( + self, NonHomogeneousType(), variables, interface, arcs + ) else: - ConditionalBayesianNetwork.__init__(self, NonHomogeneousType(), variables, interface, arcs, node_types) + ConditionalBayesianNetwork.__init__( + self, NonHomogeneousType(), variables, interface, arcs, node_types + ) self.extra_info = "extra" @@ -363,19 +456,33 @@ def __getstate_extra__(self): def __setstate_extra__(self, t): self.extra_info = t + @pytest.fixture def cond_otherbn_bytes(): - other = ConditionalOtherBN(["c", "d"], ["a", "b"], [("a", "c")], [("b", pbn.LinearGaussianCPDType()), - ("c", pbn.CKDEType()), - ("d", pbn.DiscreteFactorType())]) + other = ConditionalOtherBN( + ["c", "d"], + ["a", "b"], + [("a", "c")], + [ + ("b", pbn.LinearGaussianCPDType()), + ("c", pbn.CKDEType()), + ("d", pbn.DiscreteFactorType()), + ], + ) return pickle.dumps(other) - -def test_serialization_conditional_bn_model(cond_gaussian_bytes, cond_spbn_bytes, cond_kde_bytes, - cond_discrete_bytes, cond_genericbn_bytes, - cond_newbn_bytes, cond_otherbn_bytes, - newbn_bytes, otherbn_bytes): +def test_serialization_conditional_bn_model( + cond_gaussian_bytes, + cond_spbn_bytes, + cond_kde_bytes, + cond_discrete_bytes, + cond_genericbn_bytes, + cond_newbn_bytes, + cond_otherbn_bytes, + newbn_bytes, + otherbn_bytes, +): loaded_g = pickle.loads(cond_gaussian_bytes) assert set(loaded_g.nodes()) == set(["c", "d"]) assert set(loaded_g.interface_nodes()) == set(["a", "b"]) @@ -387,8 +494,7 @@ def test_serialization_conditional_bn_model(cond_gaussian_bytes, cond_spbn_bytes assert set(loaded_s.interface_nodes()) == set(["a", "b"]) assert loaded_s.arcs() == [("a", "c")] assert loaded_s.type() == pbn.SemiparametricBNType() - assert loaded_s.node_types() == {'c': pbn.CKDEType(), - 'd': pbn.UnknownFactorType()} + assert loaded_s.node_types() == {"c": pbn.CKDEType(), "d": pbn.UnknownFactorType()} loaded_k = pickle.loads(cond_kde_bytes) assert 
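
Every conditional fixture here takes two node lists: the variables the network actually models, and the interface variables, which only act as evidence (they can be arc sources but receive no CPD of their own). The constructor shape, isolated:

    from pybnesian import ConditionalGaussianNetwork

    cg = ConditionalGaussianNetwork(["c", "d"],    # modeled nodes
                                    ["a", "b"],    # interface (evidence-only)
                                    [("a", "c")])  # interface -> node arc
    assert set(cg.nodes()) == {"c", "d"}
    assert set(cg.interface_nodes()) == {"a", "b"}
    assert cg.arcs() == [("a", "c")]
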
set(loaded_k.nodes()) == set(["c", "d"]) @@ -419,8 +525,7 @@ def test_serialization_conditional_bn_model(cond_gaussian_bytes, cond_spbn_bytes assert set(loaded_o.interface_nodes()) == set(["a", "b"]) assert loaded_o.arcs() == [("a", "c")] assert loaded_o.type() == NonHomogeneousType() - assert loaded_o.node_types() == {'c': pbn.CKDEType(), - 'd': pbn.DiscreteFactorType()} + assert loaded_o.node_types() == {"c": pbn.CKDEType(), "d": pbn.DiscreteFactorType()} assert loaded_o.extra_info == "extra" assert loaded_nn.type() != loaded_o.type() @@ -431,6 +536,7 @@ def test_serialization_conditional_bn_model(cond_gaussian_bytes, cond_spbn_bytes assert loaded_nn.type() == loaded_unconditional_nn.type() assert loaded_o.type() == loaded_unconditional_o.type() + @pytest.fixture def cond_gaussian_partial_fit_bytes(): gaussian = pbn.ConditionalGaussianNetwork(["c", "d"], ["a", "b"], [("a", "c")]) @@ -439,6 +545,7 @@ def cond_gaussian_partial_fit_bytes(): gaussian.include_cpd = True return pickle.dumps(gaussian) + @pytest.fixture def cond_gaussian_fit_bytes(): gaussian = pbn.ConditionalGaussianNetwork(["c", "d"], ["a", "b"], [("a", "c")]) @@ -448,22 +555,32 @@ def cond_gaussian_fit_bytes(): gaussian.include_cpd = True return pickle.dumps(gaussian) + @pytest.fixture def cond_other_partial_fit_bytes(): - other = ConditionalOtherBN(["c", "d"], ["a", "b"], [("a", "c")], [("c", pbn.CKDEType()), - ("d", pbn.LinearGaussianCPDType())]) + other = ConditionalOtherBN( + ["c", "d"], + ["a", "b"], + [("a", "c")], + [("c", pbn.CKDEType()), ("d", pbn.LinearGaussianCPDType())], + ) lg = LinearGaussianCPD("d", [], [3], 1.5) other.add_cpds([lg]) other.include_cpd = True return pickle.dumps(other) + @pytest.fixture def cond_other_fit_bytes(): - other = ConditionalOtherBN(["c", "d"], ["a", "b"], [("a", "c")], [("c", pbn.CKDEType()), - ("d", pbn.DiscreteFactorType())]) + other = ConditionalOtherBN( + ["c", "d"], + ["a", "b"], + [("a", "c")], + [("c", pbn.CKDEType()), ("d", pbn.DiscreteFactorType())], + ) cpd_c = CKDE("c", ["a"]) cpd_d = DiscreteFactor("d", []) - + df_continuous = util_test.generate_normal_data_indep(100) cpd_c.fit(df_continuous) @@ -473,12 +590,17 @@ def cond_other_fit_bytes(): cpd_d.fit(df_discrete) other.add_cpds([cpd_c, cpd_d]) - + other.include_cpd = True return pickle.dumps(other) -def test_serialization_fitted_conditional_bn(cond_gaussian_partial_fit_bytes, cond_gaussian_fit_bytes, - cond_other_partial_fit_bytes, cond_other_fit_bytes): + +def test_serialization_fitted_conditional_bn( + cond_gaussian_partial_fit_bytes, + cond_gaussian_fit_bytes, + cond_other_partial_fit_bytes, + cond_other_fit_bytes, +): # #################### # Gaussian partial fit # #################### @@ -501,7 +623,7 @@ def test_serialization_fitted_conditional_bn(cond_gaussian_partial_fit_bytes, co assert cpd_c.evidence() == ["a"] assert list(cpd_c.beta) == [1, 2] assert cpd_c.variance == 2 - + cpd_d = loaded_fitted.cpd("d") assert cpd_d.variable() == "d" assert cpd_d.evidence() == [] @@ -541,10 +663,12 @@ def test_serialization_fitted_conditional_bn(cond_gaussian_partial_fit_bytes, co assert loaded_other_fitted.extra_info == "extra" assert loaded_other.type() == loaded_other_fitted.type() + # ########################## # Dynamic BN # ########################## + @pytest.fixture def dyn_gaussian_bytes(): gaussian = pbn.DynamicGaussianNetwork(["a", "b", "c", "d"], 2) @@ -552,6 +676,7 @@ def dyn_gaussian_bytes(): gaussian.transition_bn().add_arc("c_t_2", "b_t_0") return pickle.dumps(gaussian) + @pytest.fixture def dyn_spbn_bytes(): 
spbn = pbn.DynamicSemiparametricBN(["a", "b", "c", "d"], 2) @@ -560,6 +685,7 @@ def dyn_spbn_bytes(): spbn.transition_bn().set_node_type("b_t_0", pbn.CKDEType()) return pickle.dumps(spbn) + @pytest.fixture def dyn_kde_bytes(): kde = pbn.DynamicKDENetwork(["a", "b", "c", "d"], 2) @@ -567,6 +693,7 @@ def dyn_kde_bytes(): kde.transition_bn().add_arc("c_t_2", "b_t_0") return pickle.dumps(kde) + @pytest.fixture def dyn_discrete_bytes(): discrete = pbn.DynamicDiscreteBN(["a", "b", "c", "d"], 2) @@ -574,23 +701,34 @@ def dyn_discrete_bytes(): discrete.transition_bn().add_arc("c_t_2", "b_t_0") return pickle.dumps(discrete) + @pytest.fixture def dyn_genericbn_bytes(): - gen = pbn.DynamicBayesianNetwork(MyRestrictedGaussianNetworkType(), ["a", "b", "c", "d"], 2) + gen = pbn.DynamicBayesianNetwork( + MyRestrictedGaussianNetworkType(), ["a", "b", "c", "d"], 2 + ) gen.static_bn().add_arc("a_t_2", "d_t_1") gen.transition_bn().add_arc("a_t_2", "b_t_0") return pickle.dumps(gen) + class DynamicNewBN(pbn.DynamicBayesianNetwork): def __init__(self, variables, markovian_order): - pbn.DynamicBayesianNetwork.__init__(self, MyRestrictedGaussianNetworkType(), variables, markovian_order) + pbn.DynamicBayesianNetwork.__init__( + self, MyRestrictedGaussianNetworkType(), variables, markovian_order + ) + class DynamicOtherBN(pbn.DynamicBayesianNetwork): def __init__(self, variables, markovian_order, static_bn=None, transition_bn=None): if static_bn is None or transition_bn is None: - pbn.DynamicBayesianNetwork.__init__(self, NonHomogeneousType(), variables, markovian_order) + pbn.DynamicBayesianNetwork.__init__( + self, NonHomogeneousType(), variables, markovian_order + ) else: - pbn.DynamicBayesianNetwork.__init__(self, variables, markovian_order, static_bn, transition_bn) + pbn.DynamicBayesianNetwork.__init__( + self, variables, markovian_order, static_bn, transition_bn + ) self.extra_info = "extra" def __getstate_extra__(self): @@ -599,6 +737,7 @@ def __getstate_extra__(self): def __setstate_extra__(self, t): self.extra_info = t + @pytest.fixture def dyn_newbn_bytes(): new = DynamicNewBN(["a", "b", "c", "d"], 2) @@ -606,6 +745,7 @@ def dyn_newbn_bytes(): new.transition_bn().add_arc("a_t_2", "b_t_0") return pickle.dumps(new) + @pytest.fixture def dyn_otherbn_bytes(): other = DynamicOtherBN(["a", "b", "c", "d"], 2) @@ -617,8 +757,16 @@ def dyn_otherbn_bytes(): other.transition_bn().set_node_type("d_t_0", pbn.CKDEType()) return pickle.dumps(other) -def test_serialization_dbn_model(dyn_gaussian_bytes, dyn_spbn_bytes, dyn_kde_bytes, dyn_discrete_bytes, - dyn_genericbn_bytes, dyn_newbn_bytes, dyn_otherbn_bytes): + +def test_serialization_dbn_model( + dyn_gaussian_bytes, + dyn_spbn_bytes, + dyn_kde_bytes, + dyn_discrete_bytes, + dyn_genericbn_bytes, + dyn_newbn_bytes, + dyn_otherbn_bytes, +): loaded_g = pickle.loads(dyn_gaussian_bytes) assert set(loaded_g.variables()) == set(["a", "b", "c", "d"]) assert loaded_g.static_bn().arcs() == [("a_t_2", "d_t_1")] @@ -669,6 +817,7 @@ def test_serialization_dbn_model(dyn_gaussian_bytes, dyn_spbn_bytes, dyn_kde_byt assert loaded_other.static_bn().node_type("d_t_1") == pbn.CKDEType() assert loaded_other.transition_bn().node_type("d_t_0") == pbn.CKDEType() + @pytest.fixture def dyn_gaussian_partial_fit_bytes(): gaussian = pbn.DynamicGaussianNetwork(["a", "b", "c", "d"], 2) @@ -681,6 +830,7 @@ def dyn_gaussian_partial_fit_bytes(): gaussian.include_cpd = True return pickle.dumps(gaussian) + @pytest.fixture def dyn_gaussian_fit_bytes(): gaussian = pbn.DynamicGaussianNetwork(["a", "b", 
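
As these fixtures show, a dynamic network is edited through its two halves: structural changes go directly to static_bn() and transition_bn(), and per-node factor types for the semiparametric variant are set on whichever half owns the node. For instance:

    import pybnesian as pbn

    dbn = pbn.DynamicSemiparametricBN(["a", "b", "c", "d"], 2)
    dbn.static_bn().add_arc("a_t_2", "d_t_1")      # arc within the lagged slices
    dbn.transition_bn().add_arc("c_t_2", "b_t_0")  # lagged slice -> current slice
    dbn.transition_bn().set_node_type("b_t_0", pbn.CKDEType())
    assert dbn.transition_bn().node_type("b_t_0") == pbn.CKDEType()
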
"c", "d"], 2) @@ -691,24 +841,35 @@ def dyn_gaussian_fit_bytes(): gaussian.include_cpd = True return pickle.dumps(gaussian) + @pytest.fixture def dyn_other_partial_fit_bytes(): variables = ["a", "b", "c", "d"] static_nodes = [v + "_t_" + str(m) for v in variables for m in range(1, 3)] transition_nodes = [v + "_t_0" for v in variables] - other_static = OtherBN(static_nodes, [("a_t_2", "d_t_1")], [("b_t_1", pbn.DiscreteFactorType()), - ("c_t_1", pbn.CKDEType()), - ("d_t_1", pbn.LinearGaussianCPDType())]) + other_static = OtherBN( + static_nodes, + [("a_t_2", "d_t_1")], + [ + ("b_t_1", pbn.DiscreteFactorType()), + ("c_t_1", pbn.CKDEType()), + ("d_t_1", pbn.LinearGaussianCPDType()), + ], + ) lg = LinearGaussianCPD("d_t_1", ["a_t_2"], [1, 2], 2) other_static.add_cpds([lg]) - other_transition = ConditionalOtherBN(transition_nodes, - static_nodes, - [("a_t_2", "d_t_0")], - [("b_t_0", pbn.DiscreteFactorType()), - ("c_t_0", pbn.CKDEType()), - ("d_t_0", pbn.LinearGaussianCPDType())]) + other_transition = ConditionalOtherBN( + transition_nodes, + static_nodes, + [("a_t_2", "d_t_0")], + [ + ("b_t_0", pbn.DiscreteFactorType()), + ("c_t_0", pbn.CKDEType()), + ("d_t_0", pbn.LinearGaussianCPDType()), + ], + ) lg = LinearGaussianCPD("d_t_0", ["a_t_2"], [3, 4], 1.5) other_transition.add_cpds([lg]) @@ -718,25 +879,36 @@ def dyn_other_partial_fit_bytes(): dyn_other.include_cpd = True return pickle.dumps(dyn_other) + @pytest.fixture def dyn_other_fit_bytes(): variables = ["a", "b", "c", "d"] static_nodes = [v + "_t_" + str(m) for v in variables for m in range(1, 3)] transition_nodes = [v + "_t_0" for v in variables] - other_static = OtherBN(static_nodes, [("a_t_2", "d_t_1")], [("b_t_2", pbn.DiscreteFactorType()), - ("b_t_1", pbn.DiscreteFactorType()), - ("c_t_1", pbn.CKDEType()), - ("d_t_1", pbn.LinearGaussianCPDType())]) + other_static = OtherBN( + static_nodes, + [("a_t_2", "d_t_1")], + [ + ("b_t_2", pbn.DiscreteFactorType()), + ("b_t_1", pbn.DiscreteFactorType()), + ("c_t_1", pbn.CKDEType()), + ("d_t_1", pbn.LinearGaussianCPDType()), + ], + ) lg = LinearGaussianCPD("d_t_1", ["a_t_2"], [1, 2], 2) other_static.add_cpds([lg]) - other_transition = ConditionalOtherBN(transition_nodes, - static_nodes, - [("a_t_2", "d_t_0")], - [("b_t_0", pbn.DiscreteFactorType()), - ("c_t_0", pbn.CKDEType()), - ("d_t_0", pbn.LinearGaussianCPDType())]) + other_transition = ConditionalOtherBN( + transition_nodes, + static_nodes, + [("a_t_2", "d_t_0")], + [ + ("b_t_0", pbn.DiscreteFactorType()), + ("c_t_0", pbn.CKDEType()), + ("d_t_0", pbn.LinearGaussianCPDType()), + ], + ) lg = LinearGaussianCPD("d_t_0", ["a_t_2"], [3, 4], 1.5) other_transition.add_cpds([lg]) @@ -751,8 +923,13 @@ def dyn_other_fit_bytes(): dyn_other.include_cpd = True return pickle.dumps(dyn_other) -def test_serialization_fitted_dbn(dyn_gaussian_partial_fit_bytes, dyn_gaussian_fit_bytes, - dyn_other_partial_fit_bytes, dyn_other_fit_bytes): + +def test_serialization_fitted_dbn( + dyn_gaussian_partial_fit_bytes, + dyn_gaussian_fit_bytes, + dyn_other_partial_fit_bytes, + dyn_other_fit_bytes, +): # #################### # Gaussian partial fit # #################### @@ -793,7 +970,9 @@ def test_serialization_fitted_dbn(dyn_gaussian_partial_fit_bytes, dyn_gaussian_f assert loaded_partial.transition_bn().node_type("b_t_0") == pbn.DiscreteFactorType() assert loaded_partial.transition_bn().node_type("c_t_0") == pbn.CKDEType() - assert loaded_partial.transition_bn().node_type("d_t_0") == pbn.LinearGaussianCPDType() + assert ( + 
loaded_partial.transition_bn().node_type("d_t_0") == pbn.LinearGaussianCPDType() + ) cpd = loaded_partial.static_bn().cpd("d_t_1") assert cpd.variable() == "d_t_1" @@ -820,7 +999,9 @@ def test_serialization_fitted_dbn(dyn_gaussian_partial_fit_bytes, dyn_gaussian_f assert loaded_partial.transition_bn().node_type("b_t_0") == pbn.DiscreteFactorType() assert loaded_partial.transition_bn().node_type("c_t_0") == pbn.CKDEType() - assert loaded_partial.transition_bn().node_type("d_t_0") == pbn.LinearGaussianCPDType() + assert ( + loaded_partial.transition_bn().node_type("d_t_0") == pbn.LinearGaussianCPDType() + ) cpd = loaded_partial.static_bn().cpd("d_t_1") assert cpd.variable() == "d_t_1" diff --git a/tests/serialization/serialize_models_type_test.py b/tests/serialization/serialize_models_type_test.py index 7f659108..eaedaf25 100644 --- a/tests/serialization/serialize_models_type_test.py +++ b/tests/serialization/serialize_models_type_test.py @@ -3,26 +3,31 @@ import pickle import itertools + @pytest.fixture def gaussian_type_bytes(): g = pbn.GaussianNetworkType() return pickle.dumps(g) + @pytest.fixture def spbn_type_bytes(): s = pbn.SemiparametricBNType() return pickle.dumps(s) + @pytest.fixture def kde_type_bytes(): k = pbn.KDENetworkType() return pickle.dumps(k) + @pytest.fixture def discrete_type_bytes(): d = pbn.DiscreteBNType() return pickle.dumps(d) + class NewBNType(pbn.BayesianNetworkType): def __init__(self): pbn.BayesianNetworkType.__init__(self) @@ -30,11 +35,13 @@ def __init__(self): def __str__(self): return "NewType" + @pytest.fixture def new_type_bytes(): nn = NewBNType() return pickle.dumps(nn) + class OtherBNType(pbn.BayesianNetworkType): def __init__(self): pbn.BayesianNetworkType.__init__(self) @@ -45,18 +52,25 @@ def __str__(self): def __getstate_extra__(self): return self.some_useful_info - + def __setstate_extra__(self, extra): self.some_useful_info = extra + @pytest.fixture def other_type_bytes(): o = OtherBNType() return pickle.dumps(o) -def test_serialization_bn_type(gaussian_type_bytes, spbn_type_bytes, kde_type_bytes, - discrete_type_bytes, new_type_bytes, other_type_bytes): +def test_serialization_bn_type( + gaussian_type_bytes, + spbn_type_bytes, + kde_type_bytes, + discrete_type_bytes, + new_type_bytes, + other_type_bytes, +): loaded_g = pickle.loads(gaussian_type_bytes) new_g = pbn.GaussianNetworkType() assert loaded_g == new_g @@ -85,4 +99,4 @@ def test_serialization_bn_type(gaussian_type_bytes, spbn_type_bytes, kde_type_by m = [loaded_g, loaded_s, loaded_k, loaded_d, loaded_nn, loaded_o] for t in itertools.combinations(m, 2): - assert t[0] != t[1] \ No newline at end of file + assert t[0] != t[1] From d25d935170d39fc8347f26d77b48c50152303256 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Tue, 10 Sep 2024 23:11:41 +0200 Subject: [PATCH 04/75] python imports sorted --- conv_template.py | 5 ++- pybnesian/kde/opencl_kernels/KDE.cl.src | 3 +- tests/dataset/crossvalidation_test.py | 4 +-- tests/dataset/holdout_test.py | 4 +-- .../continuous/LinearGaussianCPD_test.py | 6 ++-- tests/factors/discrete/DiscreteFactor_test.py | 7 ++-- tests/factors/factor_type_test.py | 5 +-- tests/learning/algorithms/constraint_test.py | 2 +- tests/learning/operators/operatorpool_test.py | 7 ++-- tests/learning/operators/operators_test.py | 1 + tests/learning/operators/operatorset_test.py | 5 +-- tests/learning/parameters/mle_test.py | 8 +++-- tests/learning/scores/bic_test.py | 3 +- tests/models/BayesianNetwork_test.py | 21 ++++++------ tests/models/BayesianNetwork_type_test.py | 9 
+++--- tests/models/DynamicBayesianNetwork_test.py | 32 +++++++++++-------- tests/models/HeterogeneousBN_test.py | 3 +- tests/models/SemiparametricBN_test.py | 17 +++++----- .../serialize_factor_type_test.py | 4 ++- tests/serialization/serialize_models_test.py | 18 ++++++----- .../serialize_models_type_test.py | 6 ++-- 21 files changed, 99 insertions(+), 71 deletions(-) diff --git a/conv_template.py b/conv_template.py index baf0371e..c911c64f 100644 --- a/conv_template.py +++ b/conv_template.py @@ -4,6 +4,9 @@ # This code is extracted from numpy distutils. # # This code has been extracted to avoid loading numpy.distutils +# +# It is used to generate C source files from a template. +# PyBNesian uses this code to generate the OpenCL kernels for both float and double variables, so that generating the OpenCL code does not depend on numpy. ############################################################### """ @@ -65,8 +68,8 @@ __all__ = ["process_str", "process_file"] import os -import sys import re +import sys # names for replacement that are already global. global_names = {} diff --git a/pybnesian/kde/opencl_kernels/KDE.cl.src b/pybnesian/kde/opencl_kernels/KDE.cl.src index d41098b0..259eb474 100644 --- a/pybnesian/kde/opencl_kernels/KDE.cl.src +++ b/pybnesian/kde/opencl_kernels/KDE.cl.src @@ -1,4 +1,5 @@ -/* This code assumes column major data for matrices. */ +/* This file contains OpenCL code for matrix operations. +It assumes column-major data for matrices. */ #define IDX(i, j, rows) (i) + ((j)*(rows)) #define ROW(idx, rows) (idx) % (rows) diff --git a/tests/dataset/crossvalidation_test.py b/tests/dataset/crossvalidation_test.py index 4f58420b..3ed8faf9 100644 --- a/tests/dataset/crossvalidation_test.py +++ b/tests/dataset/crossvalidation_test.py @@ -1,8 +1,8 @@ import numpy as np -import pybnesian as pbn - import util_test +import pybnesian as pbn + SIZE = 10000 df = util_test.generate_normal_data(SIZE) diff --git a/tests/dataset/holdout_test.py b/tests/dataset/holdout_test.py index 46a10fb6..c835ae91 100644 --- a/tests/dataset/holdout_test.py +++ b/tests/dataset/holdout_test.py @@ -1,9 +1,9 @@ import numpy as np import pandas as pd -import pybnesian as pbn - import util_test +import pybnesian as pbn + SIZE = 10000 df = util_test.generate_normal_data(SIZE) diff --git a/tests/factors/continuous/LinearGaussianCPD_test.py b/tests/factors/continuous/LinearGaussianCPD_test.py index df42820b..1d4320c4 100644 --- a/tests/factors/continuous/LinearGaussianCPD_test.py +++ b/tests/factors/continuous/LinearGaussianCPD_test.py @@ -1,12 +1,10 @@ import numpy as np import pandas as pd import pyarrow as pa -import pybnesian as pbn - -import pytest +import util_test from scipy.stats import norm -import util_test +import pybnesian as pbn SIZE = 10000 diff --git a/tests/factors/discrete/DiscreteFactor_test.py b/tests/factors/discrete/DiscreteFactor_test.py index 0346d008..6f9ccc27 100644 --- a/tests/factors/discrete/DiscreteFactor_test.py +++ b/tests/factors/discrete/DiscreteFactor_test.py @@ -1,10 +1,11 @@ -import pytest import numpy as np import pandas as pd import pyarrow as pa +import pytest import util_test +import pybnesian as pbn + df = util_test.generate_discrete_data_dependent(10000) @@ -12,7 +13,7 @@ def test_data_type(): a = pbn.DiscreteFactor("A", []) with pytest.raises(ValueError) as ex: a.data_type() - "DiscreteFactor factor not fitted." in str(ex.value) + assert "DiscreteFactor factor not fitted."
in str(ex.value) categories = np.asarray(["a1", "a2"]) a_values = pd.Categorical( diff --git a/tests/factors/factor_type_test.py b/tests/factors/factor_type_test.py index 47e7be73..a52f678a 100644 --- a/tests/factors/factor_type_test.py +++ b/tests/factors/factor_type_test.py @@ -1,6 +1,7 @@ import pytest + import pybnesian as pbn -from pybnesian import FactorType, Factor +from pybnesian import Factor, FactorType def test_factor_type(): @@ -91,7 +92,7 @@ def type(self): dummy_network = pbn.GaussianNetwork(["a", "b", "c", "d"]) with pytest.raises(RuntimeError) as ex: - f4 = f1.type().new_factor(dummy_network, "d", ["a", "b", "c"]) + f1.type().new_factor(dummy_network, "d", ["a", "b", "c"]) assert 'Tried to call pure virtual function "FactorType::new_factor"' in str( ex.value ) diff --git a/tests/learning/algorithms/constraint_test.py b/tests/learning/algorithms/constraint_test.py index 4f22a497..34398c5a 100644 --- a/tests/learning/algorithms/constraint_test.py +++ b/tests/learning/algorithms/constraint_test.py @@ -1,4 +1,4 @@ -from pybnesian import PartiallyDirectedGraph, MeekRules +from pybnesian import MeekRules, PartiallyDirectedGraph def test_meek_rule1(): diff --git a/tests/learning/operators/operatorpool_test.py b/tests/learning/operators/operatorpool_test.py index e167e1d8..2b2b0709 100644 --- a/tests/learning/operators/operatorpool_test.py +++ b/tests/learning/operators/operatorpool_test.py @@ -1,7 +1,8 @@ import pytest -import pybnesian as pbn import util_test +import pybnesian as pbn + SIZE = 10000 df = util_test.generate_normal_data(SIZE) @@ -10,9 +11,11 @@ def test_create(): arcs = pbn.ArcOperatorSet() node_type = pbn.ChangeNodeTypeSet() pool = pbn.OperatorPool([arcs, node_type]) + # Checks if pool is created + assert pool is not None with pytest.raises(ValueError) as ex: - pool = pbn.OperatorPool([]) + pbn.OperatorPool([]) assert "cannot be empty" in str(ex.value) diff --git a/tests/learning/operators/operators_test.py b/tests/learning/operators/operators_test.py index b0f7a070..bb6e8919 100644 --- a/tests/learning/operators/operators_test.py +++ b/tests/learning/operators/operators_test.py @@ -1,4 +1,5 @@ import pytest + import pybnesian as pbn diff --git a/tests/learning/operators/operatorset_test.py b/tests/learning/operators/operatorset_test.py index a1ef7d6c..13ecfa66 100644 --- a/tests/learning/operators/operatorset_test.py +++ b/tests/learning/operators/operatorset_test.py @@ -1,8 +1,9 @@ -import pytest import numpy as np -import pybnesian as pbn +import pytest import util_test +import pybnesian as pbn + SIZE = 10000 df = util_test.generate_normal_data(SIZE) diff --git a/tests/learning/parameters/mle_test.py b/tests/learning/parameters/mle_test.py index 3cbe6b84..82985ee0 100644 --- a/tests/learning/parameters/mle_test.py +++ b/tests/learning/parameters/mle_test.py @@ -1,8 +1,9 @@ -import pytest import numpy as np -import pybnesian as pbn +import pytest import util_test +import pybnesian as pbn + SIZE = 10000 df = util_test.generate_normal_data(SIZE) @@ -30,10 +31,11 @@ def numpy_fit_mle_lg(data, variable, evidence): def test_mle_create(): with pytest.raises(ValueError) as ex: - mle = pbn.MLE(pbn.CKDEType()) + pbn.MLE(pbn.CKDEType()) assert "MLE not available" in str(ex.value) mle = pbn.MLE(pbn.LinearGaussianCPDType()) + assert mle is not None def test_mle_lg(): diff --git a/tests/learning/scores/bic_test.py b/tests/learning/scores/bic_test.py index 0030ffea..7583232b 100644 --- a/tests/learning/scores/bic_test.py +++ b/tests/learning/scores/bic_test.py @@ -1,7 +1,8 @@ 
import numpy as np +import util_test from scipy.stats import norm + import pybnesian as pbn -import util_test SIZE = 10000 diff --git a/tests/models/BayesianNetwork_test.py b/tests/models/BayesianNetwork_test.py index 09ff47d5..e5a6de0f 100644 --- a/tests/models/BayesianNetwork_test.py +++ b/tests/models/BayesianNetwork_test.py @@ -1,8 +1,9 @@ -import pytest import numpy as np +import pytest +import util_test + import pybnesian as pbn from pybnesian import BayesianNetwork, GaussianNetwork -import util_test df = util_test.generate_normal_data(10000) @@ -296,21 +297,21 @@ def test_add_cpds(): assert cpd_b.variance == 4 with pytest.raises(ValueError) as ex: - cpd_a = gbn.cpd("a") + gbn.cpd("a") assert ( 'CPD of variable "a" not added. Call add_cpds() or fit() to add the CPD.' in str(ex.value) ) with pytest.raises(ValueError) as ex: - cpd_c = gbn.cpd("c") + gbn.cpd("c") assert ( 'CPD of variable "c" not added. Call add_cpds() or fit() to add the CPD.' in str(ex.value) ) with pytest.raises(ValueError) as ex: - cpd_d = gbn.cpd("d") + gbn.cpd("d") assert ( 'CPD of variable "d" not added. Call add_cpds() or fit() to add the CPD.' in str(ex.value) @@ -337,11 +338,11 @@ def test_bn_logl(): for n in gbn.nodes(): cpd = gbn.cpd(n) - l = cpd.logl(test_df) - s = cpd.slogl(test_df) - assert np.all(np.isclose(s, l.sum())) - sum_ll += l - sum_sll += s + log_likelihood = cpd.logl(test_df) + sum_log_likelihood = cpd.slogl(test_df) + assert np.all(np.isclose(sum_log_likelihood, log_likelihood.sum())) + sum_ll += log_likelihood + sum_sll += sum_log_likelihood assert np.all(np.isclose(ll, sum_ll)) assert np.isclose(sll, ll.sum()) diff --git a/tests/models/BayesianNetwork_type_test.py b/tests/models/BayesianNetwork_type_test.py index 2ff8f4df..68e27ade 100644 --- a/tests/models/BayesianNetwork_type_test.py +++ b/tests/models/BayesianNetwork_type_test.py @@ -1,14 +1,15 @@ +import util_test + import pybnesian as pbn from pybnesian import ( - BayesianNetworkType, BayesianNetwork, + BayesianNetworkType, ConditionalBayesianNetwork, + DiscreteBN, GaussianNetwork, - SemiparametricBN, KDENetwork, - DiscreteBN, + SemiparametricBN, ) -import util_test def test_bn_type(): diff --git a/tests/models/DynamicBayesianNetwork_test.py b/tests/models/DynamicBayesianNetwork_test.py index db7677c5..05304f59 100644 --- a/tests/models/DynamicBayesianNetwork_test.py +++ b/tests/models/DynamicBayesianNetwork_test.py @@ -1,15 +1,17 @@ -import pytest import re + import numpy as np import pandas as pd +import pytest +import util_test from scipy.stats import norm + import pybnesian as pbn from pybnesian import ( - GaussianNetwork, ConditionalGaussianNetwork, DynamicGaussianNetwork, + GaussianNetwork, ) -import util_test df = util_test.generate_normal_data(1000) @@ -34,20 +36,22 @@ def test_create_dbn(): transition_bn = ConditionalGaussianNetwork(transition_nodes, static_nodes) gbn2 = DynamicGaussianNetwork(variables, 2, static_bn, transition_bn) + assert gbn2.markovian_order() == 2 + assert gbn2.variables() == ["a", "b", "c", "d"] + assert gbn2.num_variables() == 4 + assert gbn2.type() == pbn.GaussianNetworkType() wrong_transition_bn = pbn.ConditionalDiscreteBN(transition_nodes, static_nodes) with pytest.raises(ValueError) as ex: - gbn3 = DynamicGaussianNetwork(variables, 2, static_bn, wrong_transition_bn) + DynamicGaussianNetwork(variables, 2, static_bn, wrong_transition_bn) assert "Static and transition Bayesian networks do not have the same type" in str( ex.value ) wrong_static_bn = pbn.DiscreteBN(static_nodes) with 
pytest.raises(ValueError) as ex: - gbn4 = DynamicGaussianNetwork( - variables, 2, wrong_static_bn, wrong_transition_bn - ) + DynamicGaussianNetwork(variables, 2, wrong_static_bn, wrong_transition_bn) assert "Bayesian networks are not Gaussian." in str(ex.value) @@ -123,10 +127,11 @@ def static_logl(dbn, test_data, index, variable): row_values = [sl.loc[index, variable]] for e in evidence: m = re.search("(.*)_t_(\\d+)", e) - e_var = m[1] - t = int(m[2]) + if m: + e_var = m.group(1) + t = int(m.group(2)) - row_values.append(sl.loc[dbn.markovian_order() - t, e_var]) + row_values.append(sl.loc[dbn.markovian_order() - t, e_var]) r = pd.Series(data=row_values, index=[node_name] + evidence) @@ -141,10 +146,11 @@ def transition_logl(dbn, test_data, index, variable): row_values = [test_data.loc[index, variable]] for e in evidence: m = re.search("(.*)_t_(\\d+)", e) - e_var = m[1] - t = int(m[2]) + if m: + e_var = m.group(1) + t = int(m.group(2)) - row_values.append(test_data.loc[index - t, e_var]) + row_values.append(test_data.loc[index - t, e_var]) r = pd.Series(data=row_values, index=[node_name] + evidence) return lg_logl_row(r, node_name, evidence, cpd.beta, cpd.variance) diff --git a/tests/models/HeterogeneousBN_test.py b/tests/models/HeterogeneousBN_test.py index 227508f0..0bd7127b 100644 --- a/tests/models/HeterogeneousBN_test.py +++ b/tests/models/HeterogeneousBN_test.py @@ -1,6 +1,7 @@ -import pybnesian as pbn import pyarrow as pa +import pybnesian as pbn + def test_type_equality(): # diff --git a/tests/models/SemiparametricBN_test.py b/tests/models/SemiparametricBN_test.py index 91b634d5..eb977601 100644 --- a/tests/models/SemiparametricBN_test.py +++ b/tests/models/SemiparametricBN_test.py @@ -1,9 +1,10 @@ -import pytest import numpy as np -import pybnesian as pbn -from pybnesian import SemiparametricBN, LinearGaussianCPD, CKDE +import pytest import util_test +import pybnesian as pbn +from pybnesian import CKDE, LinearGaussianCPD, SemiparametricBN + df = util_test.generate_normal_data(10000) @@ -261,11 +262,11 @@ def test_logl(): for n in spbn.nodes(): cpd = spbn.cpd(n) - l = cpd.logl(test_df) - s = cpd.slogl(test_df) - assert np.all(np.isclose(s, l.sum())) - sum_ll += l - sum_sll += s + log_likelihood = cpd.logl(test_df) + sum_log_likelihood = cpd.slogl(test_df) + assert np.all(np.isclose(sum_log_likelihood, log_likelihood.sum())) + sum_ll += log_likelihood + sum_sll += sum_log_likelihood assert np.all(np.isclose(ll, sum_ll)) assert np.isclose(sll, ll.sum()) diff --git a/tests/serialization/serialize_factor_type_test.py b/tests/serialization/serialize_factor_type_test.py index d32edd3f..12777d04 100644 --- a/tests/serialization/serialize_factor_type_test.py +++ b/tests/serialization/serialize_factor_type_test.py @@ -1,7 +1,9 @@ +import pickle + import pytest + import pybnesian as pbn from pybnesian import FactorType -import pickle @pytest.fixture diff --git a/tests/serialization/serialize_models_test.py b/tests/serialization/serialize_models_test.py index f2ae24f8..8a882549 100644 --- a/tests/serialization/serialize_models_test.py +++ b/tests/serialization/serialize_models_test.py @@ -1,20 +1,22 @@ -import pytest +import pickle + import pyarrow as pa +import pytest +import util_test + import pybnesian as pbn from pybnesian import ( - BayesianNetworkType, + CKDE, BayesianNetwork, + BayesianNetworkType, ConditionalBayesianNetwork, + DiscreteBN, + DiscreteFactor, GaussianNetwork, - SemiparametricBN, KDENetwork, - DiscreteBN, LinearGaussianCPD, - CKDE, - DiscreteFactor, + SemiparametricBN, ) 
-import pickle -import util_test @pytest.fixture diff --git a/tests/serialization/serialize_models_type_test.py b/tests/serialization/serialize_models_type_test.py index eaedaf25..3c2d1ca9 100644 --- a/tests/serialization/serialize_models_type_test.py +++ b/tests/serialization/serialize_models_type_test.py @@ -1,7 +1,9 @@ +import itertools +import pickle + import pytest + import pybnesian as pbn -import pickle -import itertools @pytest.fixture From 10265b3e6bf42ec2073607f88380cc1ac27cd98c Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Wed, 11 Sep 2024 09:15:04 +0200 Subject: [PATCH 05/75] extra files removed --- INSTALL_README.md | 71 ----------------------------------------------- reinstall.sh | 15 ---------- 2 files changed, 86 deletions(-) delete mode 100644 INSTALL_README.md delete mode 100644 reinstall.sh diff --git a/INSTALL_README.md b/INSTALL_README.md deleted file mode 100644 index 77ddf858..00000000 --- a/INSTALL_README.md +++ /dev/null @@ -1,71 +0,0 @@ -# Installing PyBnesian -This is a more extensive guide on how to install PyBNesian on an Ubuntu computer and minimise error in the installation. - - -## Requirements of the Ubuntu computer for instaling PyBnesian -Before diving into installing Python packages, we need to ensure that certain C++ and OpenCL related packages are installed. We can do that with the following commands: - - sudo apt install cmake - sudo apt install g++ - sudo apt install opencl-headers - sudo apt install ocl-icd-opencl-dev - - -## Configure an environemnt -Set up a virtual environment for Python 3.10. Conda is recommended and all the details about its intallation can be found at the [Anaconda website](https://www.anaconda.com/download). - -**Important:** Once it is installed, execute the rest of the commands within your environment. You can access it via your terminal with command. - - conda activate myenv - -When done correctly, you will read *(myenv)* right before your terminal prompt. - -### Install pyarrow version 12.0.1. Posterior versions seem to cause trouble -Install pyarrow version 12.0.1 before installing PyBnesian. If not, PyBnesian will install later versions, which may cause trouble. In particular, we found that to be the cause of the error *undefined symbol _ZNK5arrow6Status8ToStringEv* - - pip install pyarrow==12.0.1 - - -## Install PyBnesian -This could be done either with pip or from source. The first option is simpler, while we recommend the second option since the library might still need some small tweaks and this allows for recompiling it - -### Option 1: Installing PyBnesian from pip -Run the command: - - pip install pybnesian - - -### Option 2: Installing pybnesian from source -First of all, clone the PyBnesian repo into your computer running the command: - - git clone https://github.com/davenza/PyBNesian - -For the latest ongoing dev changes, you might consider alternatively cloning this fork: - - git clone https://github.com/carloslihu/PyBNesian.git - -Enter into the newly created folder and run the installation file. - - cd PyBnesian - python setup.py install - -If you want to make changes to the library, you can do so by cleaning and recompiling it running the following commands: - python setup.py clean --all - python setup.py install - -Additionally, to accelerate the building process, you may use ccache. 
-- To install it, run the following command: - sudo apt install ccache -- To use it, run the following command: - export CC="ccache gcc" - -## Install pandas -A bug prevents PyBnesian from being imported if pandas is not installed and imported previously. As such make sure to install pandas running the following command and to import it in your project before importing PyBnesian. - - -## Useful links -Links that might be of help when dealing with PyBnesian installation. -1. [https://www.sasview.org/docs/old_docs/4.1.2/user/opencl_installation.html](https://www.sasview.org/docs/old_docs/4.1.2/user/opencl_installation.html) - -## Acknowledgements -Thanks to David Atienza for developing PyBnesian in the first place. Also, Carlos Li-Hu assisted to find the commands for solving OpenCL headers related issues. diff --git a/reinstall.sh b/reinstall.sh deleted file mode 100644 index dde6c84a..00000000 --- a/reinstall.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env -# workon aiTanium-master # This doesn't work -export CC="ccache gcc" -pip uninstall pybnesian -y -# python setup.py clean --all -# rm -rf build/temp.linux-x86_64-cpython-310/pybnesian/ -rm -rf build/lib.linux-x86_64-cpython-310 - - -time python setup.py install -# python setup.py develop # For verbose output -# export CC="ccache clang-14" - -# export LDSHARED="clang-14 -shared -Wl,-O1 -Wl,-Bsymbolic-functions -Wl,-Bsymbolic-functions -Wl,-z,relro -g -fwrapv -O2" # NOTE: Ignored? -# source venv/bin/activate \ No newline at end of file From a61d2d2ce1df8cb2bed7a7e33e255aa8f6e6791e Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Wed, 11 Sep 2024 11:35:08 +0200 Subject: [PATCH 06/75] Pytests improved and partially commented --- tests/factors/continuous/CKDE_test.py | 13 +- tests/factors/continuous/KDE_test.py | 135 +++++++++++++----- tests/factors/continuous/ProductKDE_test.py | 53 +++++-- .../learning/algorithms/hillclimbing_test.py | 57 +++++++- tests/learning/scores/cvlikelihood_test.py | 35 +++-- .../learning/scores/holdoutlikelihood_test.py | 58 ++++---- tests/models/SemiparametricBN_test.py | 4 +- tests/serialization/serialize_factor_test.py | 10 +- 8 files changed, 262 insertions(+), 103 deletions(-) diff --git a/tests/factors/continuous/CKDE_test.py b/tests/factors/continuous/CKDE_test.py index f4d3c060..fdbc6c30 100644 --- a/tests/factors/continuous/CKDE_test.py +++ b/tests/factors/continuous/CKDE_test.py @@ -1,14 +1,13 @@ -import pytest import numpy as np -import pyarrow as pa import pandas as pd -import pybnesian as pbn +import pyarrow as pa +import pytest +import util_test from scipy.stats import gaussian_kde -from scipy.stats import norm from scipy.stats import multivariate_normal as mvn -from scipy.special import logsumexp +from scipy.stats import norm -import util_test +import pybnesian as pbn SIZE = 10000 SMALL_SIZE = 10 @@ -46,7 +45,7 @@ def test_kde_data_type(): with pytest.raises(ValueError) as ex: k.data_type() - "CKDE factor not fitted" in str(ex.value) + assert "CKDE factor not fitted" in str(ex.value) k.fit(df) assert k.data_type() == pa.float64() diff --git a/tests/factors/continuous/KDE_test.py b/tests/factors/continuous/KDE_test.py index 6e60c813..dd5d9b9f 100644 --- a/tests/factors/continuous/KDE_test.py +++ b/tests/factors/continuous/KDE_test.py @@ -1,14 +1,14 @@ -import pytest import numpy as np import pyarrow as pa -import pybnesian as pbn -from pybnesian import BandwidthSelector +import pytest from scipy.stats import gaussian_kde +from util_test import generate_normal_data -import util_test +import 
pybnesian as pbn from pybnesian import BandwidthSelector SIZE = 500 -df = util_test.generate_normal_data(SIZE, seed=0) +df = generate_normal_data(SIZE, seed=0) df_float = df.astype("float32") @@ -111,7 +111,7 @@ def test_kde_data_type(): with pytest.raises(ValueError) as ex: k.data_type() - "KDE factor not fitted" in str(ex.value) + assert "KDE factor not fitted" in str(ex.value) k.fit(df) assert k.data_type() == pa.float64() @@ -195,28 +195,52 @@ def _test_kde_fit_null_iter(variables, _df, instances): def test_kde_logl(): + """Tests the logl() method of the KDE factor. It compares the results with the ones obtained with scipy's gaussian_kde, + both for float64 and float32 data types.""" + def _test_kde_logl_iter(variables, _df, _test_df): - cpd = pbn.KDE(variables) + """Tests that the logl() method of the KDE factor returns the same results as scipy's gaussian_kde. + It trains _df and tests it with _test_df. + Args: + variables (list[str]): Dataset variables to use. + _df (pd.DataFrame): Training dataset. + _test_df (pd.DataFrame): Test dataset. + """ + npdata = _df.loc[:, variables].to_numpy() + cpd = pbn.KDE( + variables, + # bandwidth_selector=pbn.ScottsBandwidth(), + bandwidth_selector=pbn.NormalReferenceRule(), + ) cpd.fit(_df) - npdata = _df.loc[:, variables].to_numpy() scipy_kde = gaussian_kde( - npdata.T, + dataset=npdata.T, + # bw_method="scott", bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4)) - * s.scotts_factor(), + * s.scotts_factor(), # the Normal Reference Rule factor is (4 / (s.d + 2)) ** (1 / (s.d + 4)) times Scott's factor ) + # TODO: Add tests to check this + # NOTE + # scipy_kde.factor == scipy_kde.covariance_factor() <-- coefficient (kde.factor) that, squared, multiplies the data covariance matrix to obtain the kernel covariance matrix. + # scipy_kde.covariance == scipy_kde.factor ** 2 * npdata.var() (in the univariate case) + # scipy_kde.inv_cov == 1 / scipy_kde.covariance (in the univariate case) + # We check that the bandwidth is the same + # TODO: Add tests to check "scott" bandwidth selectors + assert np.all(np.isclose(cpd.bandwidth, scipy_kde.covariance)) + test_npdata = _test_df.loc[:, variables].to_numpy() logl = cpd.logl(_test_df) - scipy = scipy_kde.logpdf(test_npdata.T) + scipy_logl = scipy_kde.logpdf(test_npdata.T) if np.all(_df.dtypes == "float32"): - assert np.all(np.isclose(logl, scipy, atol=0.0005)) + assert np.all(np.isclose(logl, scipy_logl, atol=0.0005)) else: - assert np.all(np.isclose(logl, scipy)) + assert np.all(np.isclose(logl, scipy_logl)) - test_df = util_test.generate_normal_data(50, seed=1) + test_df = generate_normal_data(50, seed=1) test_df_float = test_df.astype("float32") for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]: @@ -241,7 +265,18 @@ def test_kde_logl_null(): + """Tests the logl() method of the KDE factor with null values. It compares the results with the ones obtained with scipy's gaussian_kde, + both for float64 and float32 data types.""" + def _test_kde_logl_null_iter(variables, _df, _test_df): + """Tests that the logl() method of the KDE factor with null values returns the same results as scipy's gaussian_kde. + It trains _df and tests it with _test_df. + Args: + variables (list[str]): Dataset variables to use. + _df (pd.DataFrame): Training dataset. + _test_df (pd.DataFrame): Test dataset.
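+        Note: scipy has no NormalReferenceRule bandwidth selector, so it is emulated here through the bw_method callable; the factor (4 / (s.d + 2)) ** (1 / (s.d + 4)) * s.scotts_factor() is assumed to match PyBNesian's NormalReferenceRule bandwidth.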
+        """ cpd = pbn.KDE(variables) cpd.fit(_df) @@ -251,23 +285,36 @@ def _test_kde_logl_null_iter(variables, _df, _test_df): bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor(), ) + # We initialize the logl and scipy_logl columns with NaN + _test_df["logl"] = np.nan + _test_df["scipy_logl"] = np.nan - test_npdata = _test_df.loc[:, variables].to_numpy() + # We calculate the logl with the KDE factor + _test_df["logl"] = cpd.logl(_test_df) - logl = cpd.logl(_test_df) - - scipy_result = np.full((test_npdata.shape[0],), np.nan) - nan_rows = np.any(np.isnan(test_npdata), axis=1) - scipy_result[~nan_rows] = scipy_kde.logpdf(test_npdata[~nan_rows].T) + # We calculate the logl with scipy (we have to avoid NaN values) + non_nan_index = _test_df[variables].notna().all(1) + _test_df.loc[non_nan_index, "scipy_logl"] = scipy_kde.logpdf( + _test_df.loc[non_nan_index, variables].T.to_numpy() + ) if npdata.dtype == "float32": - assert np.all(np.isclose(logl, scipy_result, atol=0.0005, equal_nan=True)) + assert np.all( + np.isclose( + _test_df["logl"], + _test_df["scipy_logl"], + atol=0.0005, + equal_nan=True, + ) + ) else: - assert np.all(np.isclose(logl, scipy_result, equal_nan=True)) + assert np.all( + np.isclose(_test_df["logl"], _test_df["scipy_logl"], equal_nan=True) + ) TEST_SIZE = 50 - test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) + test_df = generate_normal_data(TEST_SIZE, seed=1) test_df_float = test_df.astype("float32") np.random.seed(0) @@ -315,7 +362,17 @@ def _test_kde_logl_null_iter(variables, _df, _test_df): def test_kde_slogl(): + """Tests the slogl() method of the KDE factor. It compares the results with the ones obtained with scipy's gaussian_kde, + both for float64 and float32 data types.""" + def _test_kde_slogl_iter(variables, _df, _test_df): + """Tests that the slogl() method of the KDE factor returns the same results as scipy's gaussian_kde. + It trains _df and tests it with _test_df. + Args: + variables (list[str]): Dataset variables to use. + _df (pd.DataFrame): Training dataset. + _test_df (pd.DataFrame): Test dataset. + """ cpd = pbn.KDE(variables) cpd.fit(_df) @@ -331,7 +388,7 @@ def _test_kde_slogl_iter(variables, _df, _test_df): np.isclose(cpd.slogl(_test_df), scipy_kde.logpdf(test_npdata.T).sum()) ) - test_df = util_test.generate_normal_data(50, seed=1) + test_df = generate_normal_data(50, seed=1) test_df_float = test_df.astype("float32") for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]: @@ -356,7 +413,18 @@ def _test_kde_slogl_iter(variables, _df, _test_df): def test_kde_slogl_null(): + """Tests the slogl() method of the KDE factor with null values. It compares the results with the ones obtained with scipy's gaussian_kde, + both for float64 and float32 data types.""" + def _test_kde_slogl_null_iter(variables, _df, _test_df): + """Tests that the slogl() method of the KDE factor with null values returns the same results as scipy's gaussian_kde. + It trains _df and tests it with _test_df. + Args: + variables (list[str]): Dataset variables to use. + _df (pd.DataFrame): Training dataset. + _test_df (pd.DataFrame): Test dataset.
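+        Note: slogl() is assumed to equal the sum of logl() over the rows with complete data, so the scipy reference below is computed only on the rows without NaN values before summing.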
+        """ cpd = pbn.KDE(variables) cpd.fit(_df) @@ -366,20 +433,20 @@ def _test_kde_slogl_null_iter(variables, _df, _test_df): bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor(), ) + # We initialize the scipy_logl column with NaN + _test_df["scipy_logl"] = np.nan + slogl = cpd.slogl(_test_df) + # We calculate the slogl with scipy (we have to avoid NaN values) + non_nan_index = _test_df[variables].notna().all(1) + scipy_slogl = scipy_kde.logpdf( + _test_df.loc[non_nan_index, variables].T.to_numpy() + ).sum() - test_npdata = _test_df.loc[:, variables].to_numpy() - - nan_rows = np.any(np.isnan(test_npdata), axis=1) - - assert np.all( - np.isclose( - cpd.slogl(_test_df), scipy_kde.logpdf(test_npdata[~nan_rows].T).sum() - ) - ) + assert np.all(np.isclose(slogl, scipy_slogl)) TEST_SIZE = 50 - test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) + test_df = generate_normal_data(TEST_SIZE, seed=1) test_df_float = test_df.astype("float32") np.random.seed(0) diff --git a/tests/factors/continuous/ProductKDE_test.py b/tests/factors/continuous/ProductKDE_test.py index d76ff970..30998342 100644 --- a/tests/factors/continuous/ProductKDE_test.py +++ b/tests/factors/continuous/ProductKDE_test.py @@ -1,12 +1,11 @@ -import pytest import numpy as np import pyarrow as pa -import pybnesian as pbn -from pybnesian import BandwidthSelector +import pytest +import util_test from scipy.stats import gaussian_kde -from functools import reduce -import util_test +import pybnesian as pbn +from pybnesian import BandwidthSelector SIZE = 500 df = util_test.generate_normal_data(SIZE, seed=0) df_float = df.astype("float32") @@ -135,7 +134,7 @@ def test_productkde_data_type(): with pytest.raises(ValueError) as ex: k.data_type() - "KDE factor not fitted" in str(ex.value) + assert "KDE factor not fitted" in str(ex.value) k.fit(df) assert k.data_type() == pa.float64() @@ -255,7 +254,17 @@ def factor_product_kernel(train_data): def test_productkde_logl(): + """Tests the logl() method of the ProductKDE factor. It compares the results with the ones obtained with scipy's gaussian_kde. + Both for float64 and float32 data types.""" + def _test_productkde_logl_iter(variables, _df, _test_df): + """Tests that the logl() method of the ProductKDE factor returns the same results as scipy's gaussian_kde. + It trains _df and tests it with _test_df. + Args: + variables (list[str]): Dataset variables to use. + _df (pd.DataFrame): Training dataset. + _test_df (pd.DataFrame): Test dataset. + """ cpd = pbn.ProductKDE(variables) cpd.fit(_df) @@ -307,8 +316,18 @@ def _test_productkde_logl_iter(variables, _df, _test_df): def test_productkde_logl_null(): + """Tests the logl() method of the ProductKDE factor with null values. It compares the results with the ones obtained with scipy's gaussian_kde. + Both for float64 and float32 data types.""" + def _test_productkde_logl_null_iter(variables, _df, _test_df): - cpd = pbn.ProductKDE(variables) + """Tests that the logl() method of the ProductKDE factor with null values returns the same results as scipy's gaussian_kde. + It trains _df and tests it with _test_df. + Args: + variables (list[str]): Dataset variables to use. + _df (pd.DataFrame): Training dataset. + _test_df (pd.DataFrame): Test dataset. + """ + cpd = pbn.ProductKDE(variables, bandwidth_selector=pbn.NormalReferenceRule()) cpd.fit(_df) logl = cpd.logl(_test_df) @@ -386,8 +405,17 @@ def _test_productkde_logl_null_iter(variables, _df, _test_df): def test_productkde_slogl(): + """Tests the slogl() method of the ProductKDE factor.
It compares the results with the ones obtained with scipy's gaussian_kde.""" + def _test_productkde_slogl_iter(variables, _df, _test_df): - cpd = pbn.ProductKDE(variables) + """Tests that the slogl() method of the ProductKDE factor returns the same results as scipy's gaussian_kde. + + Args: + variables (list[str]): Dataset variables to use. + _df (pd.DataFrame): Training dataset. + _test_df (pd.DataFrame): Test dataset. + """ + cpd = pbn.ProductKDE(variables, bandwidth_selector=pbn.NormalReferenceRule()) cpd.fit(_df) npdata = _df.loc[:, variables].to_numpy() @@ -445,7 +473,14 @@ def _test_productkde_slogl_iter(variables, _df, _test_df): def test_productkde_slogl_null(): def _test_productkde_slogl_null_iter(variables, _df, _test_df): - cpd = pbn.ProductKDE(variables) + """Tests that the slogl() method of the ProductKDE factor with null values returns the same results as scipy's gaussian_kde. + + Args: + variables (list[str]): Dataset variables to use. + _df (pd.DataFrame): Training dataset. + _test_df (pd.DataFrame): Test dataset. + """ + cpd = pbn.ProductKDE(variables, bandwidth_selector=pbn.NormalReferenceRule()) cpd.fit(_df) npdata = _df.loc[:, variables].to_numpy() diff --git a/tests/learning/algorithms/hillclimbing_test.py b/tests/learning/algorithms/hillclimbing_test.py index 8764d5bb..b62dc881 100644 --- a/tests/learning/algorithms/hillclimbing_test.py +++ b/tests/learning/algorithms/hillclimbing_test.py @@ -1,9 +1,12 @@ import numpy as np -import pybnesian as pbn -from pybnesian import BayesianNetworkType, BayesianNetwork import util_test +import pybnesian as pbn +from pybnesian import BayesianNetwork, BayesianNetworkType + df = util_test.generate_normal_data(1000) +# TODO: Add tests for normal data with dependencies +# dep_df = util_test.generate_normal_data_dep(1000) def test_hc_estimate(): @@ -207,6 +210,56 @@ def test_hc_shortcut_function(): assert type(model) == NewBN +# # NOTE: Deprecated test for PyBNesian with full covariance matrices +# def test_hc_arc_singular_covariance(): +# """Function to test if with the GBN, KDE and SPBN, the HC algorithm raises an exception when the covariance matrix is singular. 
Then we check if the learnt model is valid.""" +# column_names = list(dep_df.columns.values) +# # GBN +# gbn = pbn.GaussianNetwork(nodes=column_names) +# gbn = pbn.hc( +# dep_df, +# start=gbn, +# max_iters=int(1e4), +# verbose=True, +# ) +# gbn.fit(dep_df) +# assert gbn.num_arcs() == 0 +# assert np.count_nonzero(np.isnan(gbn.logl(dep_df))) == 0 +# for c in column_names: +# print(f"{gbn.cpd(c)}") + +# # KDE +# kde = pbn.KDENetwork(nodes=column_names) +# kde = pbn.hc( +# dep_df, +# start=kde, +# max_iters=int(1e4), +# verbose=True, +# ) +# kde.fit(dep_df) +# assert kde.num_arcs() == 0 +# assert np.count_nonzero(np.isnan(kde.logl(dep_df))) == 0 +# for c in column_names: +# print(f"{kde.cpd(c)}") + +# # SPBN +# spbn = pbn.SemiparametricBN(nodes=column_names) +# spbn = pbn.hc( +# dep_df, +# start=spbn, +# max_iters=int(1e4), +# verbose=True, +# ) +# spbn.fit(dep_df) +# assert spbn.num_arcs() == 0 +# assert np.count_nonzero(np.isnan(spbn.logl(dep_df))) == 0 +# for c in column_names: +# print(f"{spbn.cpd(c)}") + + +# TODO: Test for when one variable has 0 variance in k-fold cross-validation for CKDEType + + class MyRestrictedGaussianNetworkType(BayesianNetworkType): def __init__(self): BayesianNetworkType.__init__(self) diff --git a/tests/learning/scores/cvlikelihood_test.py b/tests/learning/scores/cvlikelihood_test.py index 5a012cb6..0bdc2c72 100644 --- a/tests/learning/scores/cvlikelihood_test.py +++ b/tests/learning/scores/cvlikelihood_test.py @@ -1,33 +1,29 @@ -import pytest import numpy as np -from scipy.stats import norm, gaussian_kde +import pandas as pd +import pytest +from scipy.stats import gaussian_kde, norm +from util_test import generate_normal_data + import pybnesian as pbn -import util_test SIZE = 1000 -df = util_test.generate_normal_data(SIZE) +df = generate_normal_data(SIZE) seed = 0 -def numpy_local_score(node_type, data, variable, evidence): +def numpy_local_score( + node_type: pbn.FactorType, data: pd.DataFrame, variable: str, evidence: list[str] +): cv = pbn.CrossValidation(data, 10, seed) loglik = 0 for train_df, test_df in cv: - if isinstance(variable, str): - node_data = train_df.to_pandas().loc[:, [variable] + evidence].dropna() - variable_data = node_data.loc[:, variable] - evidence_data = node_data.loc[:, evidence] - test_node_data = test_df.to_pandas().loc[:, [variable] + evidence].dropna() - test_variable_data = test_node_data.loc[:, variable] - test_evidence_data = test_node_data.loc[:, evidence] - else: - node_data = train_df.to_pandas().iloc[:, [variable] + evidence].dropna() - variable_data = node_data.iloc[:, 0] - evidence_data = node_data.iloc[:, 1:] - test_node_data = test_df.to_pandas().iloc[:, [variable] + evidence].dropna() - test_variable_data = test_node_data.iloc[:, 0] - test_evidence_data = test_node_data.iloc[:, 1:] + node_data = train_df.to_pandas().loc[:, [variable] + evidence].dropna() + variable_data = node_data.loc[:, variable] + evidence_data = node_data.loc[:, evidence] + test_node_data = test_df.to_pandas().loc[:, [variable] + evidence].dropna() + test_variable_data = test_node_data.loc[:, variable] + test_evidence_data = test_node_data.loc[:, evidence] if node_type == pbn.LinearGaussianCPDType(): N = variable_data.shape[0] @@ -40,6 +36,7 @@ def numpy_local_score(node_type, data, variable, evidence): means = beta[0] + np.sum(beta[1:] * test_evidence_data, axis=1) loglik += norm.logpdf(test_variable_data, means, np.sqrt(var)).sum() + elif node_type == pbn.CKDEType(): k_joint = gaussian_kde( node_data.to_numpy().T, diff --git 
a/tests/learning/scores/holdoutlikelihood_test.py b/tests/learning/scores/holdoutlikelihood_test.py index 5a2e051e..9d7c5227 100644 --- a/tests/learning/scores/holdoutlikelihood_test.py +++ b/tests/learning/scores/holdoutlikelihood_test.py @@ -1,30 +1,32 @@ -import pytest import numpy as np +import pandas as pd +import pytest from scipy.stats import gaussian_kde, norm +from util_test import generate_normal_data + import pybnesian as pbn -import util_test SIZE = 1000 -df = util_test.generate_normal_data(SIZE) +df = generate_normal_data(SIZE) seed = 0 -def numpy_local_score(node_type, training_data, test_data, variable, evidence): - if isinstance(variable, str): - node_data = training_data.loc[:, [variable] + evidence].dropna() - variable_data = node_data.loc[:, variable] - evidence_data = node_data.loc[:, evidence] - test_node_data = test_data.loc[:, [variable] + evidence].dropna() - test_variable_data = test_node_data.loc[:, variable] - test_evidence_data = test_node_data.loc[:, evidence] - else: - node_data = training_data.iloc[:, [variable] + evidence].dropna() - variable_data = node_data.iloc[:, 0] - evidence_data = node_data.iloc[:, 1:] - test_node_data = test_data.iloc[:, [variable] + evidence].dropna() - test_variable_data = test_node_data.iloc[:, 0] - test_evidence_data = test_node_data.iloc[:, 1:] +def numpy_local_score( + node_type: pbn.FactorType, + training_data: pd.DataFrame, + test_data: pd.DataFrame, + variable: str, + evidence: list[str], +): + node_data = training_data.loc[:, [variable] + evidence].dropna() + variable_data = node_data.loc[:, variable] + evidence_data = node_data.loc[:, evidence] + test_node_data = test_data.loc[:, [variable] + evidence].dropna() + test_variable_data = test_node_data.loc[:, variable] + test_evidence_data = test_node_data.loc[:, evidence] + + loglik = 0 if node_type == pbn.LinearGaussianCPDType(): N = variable_data.shape[0] d = evidence_data.shape[1] @@ -35,7 +37,8 @@ def numpy_local_score(node_type, training_data, test_data, variable, evidence): var = res / (N - d - 1) means = beta[0] + np.sum(beta[1:] * test_evidence_data, axis=1) - return norm.logpdf(test_variable_data, means, np.sqrt(var)).sum() + loglik = norm.logpdf(test_variable_data, means, np.sqrt(var)).sum() + elif node_type == pbn.CKDEType(): k_joint = gaussian_kde( node_data.to_numpy().T, @@ -46,35 +49,38 @@ def numpy_local_score(node_type, training_data, test_data, variable, evidence): k_marg = gaussian_kde( evidence_data.to_numpy().T, bw_method=k_joint.covariance_factor() ) - return np.sum( + loglik = np.sum( k_joint.logpdf(test_node_data.to_numpy().T) - k_marg.logpdf(test_evidence_data.to_numpy().T) ) else: - return np.sum(k_joint.logpdf(test_node_data.to_numpy().T)) + loglik = np.sum(k_joint.logpdf(test_node_data.to_numpy().T)) + + return loglik def test_holdout_create(): + """Test HoldoutLikelihood creation with different parameters""" s = pbn.HoldoutLikelihood(df) assert s.training_data().num_rows == 0.8 * SIZE assert s.test_data().num_rows == 0.2 * SIZE - s = pbn.HoldoutLikelihood(df, 0.5) + s = pbn.HoldoutLikelihood(df, test_ratio=0.5) assert s.training_data().num_rows == 0.5 * SIZE assert s.test_data().num_rows == 0.5 * SIZE - s = pbn.HoldoutLikelihood(df, 0.2, 0) - s2 = pbn.HoldoutLikelihood(df, 0.2, 0) + s = pbn.HoldoutLikelihood(df, test_ratio=0.2, seed=0) + s2 = pbn.HoldoutLikelihood(df, test_ratio=0.2, seed=0) assert s.training_data().equals(s2.training_data()) assert s.test_data().equals(s2.test_data()) with pytest.raises(ValueError) as ex: - s = 
pbn.HoldoutLikelihood(df, 10, 0) + s = pbn.HoldoutLikelihood(df, test_ratio=10, seed=0) assert "test_ratio must be a number" in str(ex.value) with pytest.raises(ValueError) as ex: - s = pbn.HoldoutLikelihood(df, 0, 0) + s = pbn.HoldoutLikelihood(df, test_ratio=0, seed=0) assert "test_ratio must be a number" in str(ex.value) diff --git a/tests/models/SemiparametricBN_test.py b/tests/models/SemiparametricBN_test.py index eb977601..413db9d3 100644 --- a/tests/models/SemiparametricBN_test.py +++ b/tests/models/SemiparametricBN_test.py @@ -228,7 +228,7 @@ def test_add_cpds(): spbn.set_node_type("a", pbn.UnknownFactorType()) with pytest.raises(ValueError) as ex: - not spbn.cpd("a").fitted() + spbn.cpd("a").fitted() assert ( 'CPD of variable "a" not added. Call add_cpds() or fit() to add the CPD.' in str(ex.value) @@ -237,7 +237,7 @@ def test_add_cpds(): assert spbn.cpd("b").fitted() with pytest.raises(ValueError) as ex: - not spbn.cpd("c").fitted() + spbn.cpd("c").fitted() assert ( 'CPD of variable "c" not added. Call add_cpds() or fit() to add the CPD.' in str(ex.value) diff --git a/tests/serialization/serialize_factor_test.py b/tests/serialization/serialize_factor_test.py index 05fa9405..0e3a6fea 100644 --- a/tests/serialization/serialize_factor_test.py +++ b/tests/serialization/serialize_factor_test.py @@ -1,9 +1,11 @@ +import pickle + import numpy as np import pandas as pd import pytest + import pybnesian as pbn -from pybnesian import FactorType, Factor, LinearGaussianCPD, CKDE, DiscreteFactor -import pickle +from pybnesian import CKDE, DiscreteFactor, Factor, FactorType, LinearGaussianCPD @pytest.fixture @@ -265,11 +267,11 @@ def test_serialization_fitted_factor( assert loaded_newbis.variable() == "c" assert set(loaded_newbis.evidence()) == set(["a", "b"]) assert loaded_newbis.fitted() - assert type(loaded_newbis.type()) == NewType + assert isinstance(loaded_newbis.type(), NewType) nn = NewFactorBis("a", []) assert loaded_newbis.type() == nn.type() assert loaded_newbis.some_fit_data == "fitted" - assert type(loaded_newbis.type()) == type(loaded_new.type()) + assert isinstance(loaded_newbis.type(), type(loaded_new.type())) assert loaded_lg.type() != loaded_ckde.type() assert loaded_lg.type() != loaded_discrete.type() From 4cebd863a1091787bdf48e94e871ce73e413db85 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Wed, 11 Sep 2024 11:43:13 +0200 Subject: [PATCH 07/75] util_test imports updated --- tests/dataset/crossvalidation_test.py | 4 ++-- tests/dataset/holdout_test.py | 4 ++-- tests/factors/continuous/CKDE_test.py | 20 +++++++++---------- .../continuous/LinearGaussianCPD_test.py | 16 +++++++-------- tests/factors/continuous/ProductKDE_test.py | 12 +++++------ tests/factors/discrete/DiscreteFactor_test.py | 2 +- .../learning/algorithms/hillclimbing_test.py | 6 +++--- tests/learning/operators/operatorpool_test.py | 4 ++-- tests/learning/operators/operatorset_test.py | 4 ++-- tests/learning/parameters/mle_test.py | 4 ++-- tests/learning/scores/bic_test.py | 4 ++-- tests/learning/scores/cvlikelihood_test.py | 2 +- .../learning/scores/holdoutlikelihood_test.py | 2 +- tests/models/BayesianNetwork_test.py | 6 +++--- tests/models/BayesianNetwork_type_test.py | 4 ++-- tests/models/DynamicBayesianNetwork_test.py | 10 +++++----- tests/models/SemiparametricBN_test.py | 6 +++--- tests/serialization/serialize_models_test.py | 16 +++++++-------- 18 files changed, 63 insertions(+), 63 deletions(-) diff --git a/tests/dataset/crossvalidation_test.py b/tests/dataset/crossvalidation_test.py index 
3ed8faf9..6fb10867 100644 --- a/tests/dataset/crossvalidation_test.py +++ b/tests/dataset/crossvalidation_test.py @@ -1,11 +1,11 @@ import numpy as np -import util_test +from util_test import generate_normal_data import pybnesian as pbn SIZE = 10000 -df = util_test.generate_normal_data(SIZE) +df = generate_normal_data(SIZE) def test_cv_disjoint_indices(): diff --git a/tests/dataset/holdout_test.py b/tests/dataset/holdout_test.py index c835ae91..73fe7913 100644 --- a/tests/dataset/holdout_test.py +++ b/tests/dataset/holdout_test.py @@ -1,12 +1,12 @@ import numpy as np import pandas as pd -import util_test +from util_test import generate_normal_data import pybnesian as pbn SIZE = 10000 -df = util_test.generate_normal_data(SIZE) +df = generate_normal_data(SIZE) def test_holdout_disjoint(): diff --git a/tests/factors/continuous/CKDE_test.py b/tests/factors/continuous/CKDE_test.py index fdbc6c30..2137e483 100644 --- a/tests/factors/continuous/CKDE_test.py +++ b/tests/factors/continuous/CKDE_test.py @@ -2,18 +2,18 @@ import pandas as pd import pyarrow as pa import pytest -import util_test from scipy.stats import gaussian_kde from scipy.stats import multivariate_normal as mvn from scipy.stats import norm +from util_test import generate_normal_data import pybnesian as pbn SIZE = 10000 SMALL_SIZE = 10 TEST_SIZE = 50 -df = util_test.generate_normal_data(SIZE, seed=0) -df_small = util_test.generate_normal_data(SMALL_SIZE, seed=0) +df = generate_normal_data(SIZE, seed=0) +df_small = generate_normal_data(SMALL_SIZE, seed=0) df_float = df.astype("float32") df_small_float = df_small.astype("float32") @@ -207,7 +207,7 @@ def train_scipy_ckde(data, variable, evidence): ) if evidence: scipy_kde_marg = gaussian_kde( - npdata_marg[~nan_rows, :].T, bw_method=scipy_kde_joint.covariance_factor() + npdata_marg[~nan_rows, :].T, bw_method=scipy_kde_joint.factor ) else: scipy_kde_marg = None @@ -311,7 +311,7 @@ def _test_ckde_logl(variable, evidence, _df, _test_df): else: assert np.all(np.isclose(logl, scipy)) - test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) + test_df = generate_normal_data(TEST_SIZE, seed=1) test_df_float = test_df.astype("float32") for variable, evidence in [ @@ -359,7 +359,7 @@ def _test_ckde_logl_null(variable, evidence, _df, _test_df): else: assert np.all(np.isclose(logl, scipy, equal_nan=True)) - test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) + test_df = generate_normal_data(TEST_SIZE, seed=1) test_df_float = test_df.astype("float32") np.random.seed(0) @@ -432,7 +432,7 @@ def _test_ckde_slogl(variable, evidence, _df, _test_df): else: assert np.isclose(cpd.slogl(_test_df), scipy_logl.sum()) - test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) + test_df = generate_normal_data(TEST_SIZE, seed=1) test_df_float = test_df.astype("float32") for variable, evidence in [ @@ -481,7 +481,7 @@ def _test_ckde_slogl_null(variable, evidence, _df, _test_df): else: assert np.isclose(cpd.slogl(_test_df), np.nansum(scipy_logl)) - test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) + test_df = generate_normal_data(TEST_SIZE, seed=1) test_df_float = test_df.astype("float32") np.random.seed(0) @@ -546,7 +546,7 @@ def _test_ckde_cdf(variable, evidence, _df, _test_df): else: assert np.all(np.isclose(cdf, scipy)) - test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) + test_df = generate_normal_data(TEST_SIZE, seed=1) test_df_float = test_df.astype("float32") for variable, evidence in [ @@ -594,7 +594,7 @@ def _test_ckde_cdf_null(variable, evidence, _df, _test_df): 
else: assert np.all(np.isclose(cdf, scipy, equal_nan=True)) - test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) + test_df = generate_normal_data(TEST_SIZE, seed=1) test_df_float = test_df.astype("float32") np.random.seed(0) diff --git a/tests/factors/continuous/LinearGaussianCPD_test.py b/tests/factors/continuous/LinearGaussianCPD_test.py index 1d4320c4..4b74f056 100644 --- a/tests/factors/continuous/LinearGaussianCPD_test.py +++ b/tests/factors/continuous/LinearGaussianCPD_test.py @@ -1,14 +1,14 @@ import numpy as np import pandas as pd import pyarrow as pa -import util_test from scipy.stats import norm +from util_test import generate_normal_data import pybnesian as pbn SIZE = 10000 -df = util_test.generate_normal_data(SIZE) +df = generate_normal_data(SIZE) def test_lg_variable(): @@ -135,7 +135,7 @@ def numpy_cdf(test_df, variable, evidence, beta, variance): def test_lg_logl(): - test_df = util_test.generate_normal_data(5000) + test_df = generate_normal_data(5000) for variable, evidence in [ ("a", []), @@ -173,7 +173,7 @@ def test_lg_logl(): def test_lg_logl_null(): - test_df = util_test.generate_normal_data(5000) + test_df = generate_normal_data(5000) np.random.seed(0) a_null = np.random.randint(0, 5000, size=100) @@ -224,7 +224,7 @@ def test_lg_logl_null(): def test_lg_slogl(): - test_df = util_test.generate_normal_data(5000) + test_df = generate_normal_data(5000) for variable, evidence in [ ("a", []), @@ -262,7 +262,7 @@ def test_lg_slogl(): def test_lg_slogl_null(): - test_df = util_test.generate_normal_data(5000) + test_df = generate_normal_data(5000) np.random.seed(0) a_null = np.random.randint(0, 5000, size=100) @@ -312,7 +312,7 @@ def test_lg_slogl_null(): def test_lg_cdf(): - test_df = util_test.generate_normal_data(5000) + test_df = generate_normal_data(5000) for variable, evidence in [ ("a", []), @@ -349,7 +349,7 @@ def test_lg_cdf(): def test_lg_cdf_null(): - test_df = util_test.generate_normal_data(5000) + test_df = generate_normal_data(5000) np.random.seed(0) a_null = np.random.randint(0, 5000, size=100) diff --git a/tests/factors/continuous/ProductKDE_test.py b/tests/factors/continuous/ProductKDE_test.py index 30998342..0920a5ad 100644 --- a/tests/factors/continuous/ProductKDE_test.py +++ b/tests/factors/continuous/ProductKDE_test.py @@ -1,14 +1,14 @@ import numpy as np import pyarrow as pa import pytest -import util_test from scipy.stats import gaussian_kde +from util_test import generate_normal_data import pybnesian as pbn from pybnesian import BandwidthSelector SIZE = 500 -df = util_test.generate_normal_data(SIZE, seed=0) +df = generate_normal_data(SIZE, seed=0) df_float = df.astype("float32") @@ -291,7 +291,7 @@ def _test_productkde_logl_iter(variables, _df, _test_df): else: assert np.all(np.isclose(logl, scipy)) - test_df = util_test.generate_normal_data(50, seed=1) + test_df = generate_normal_data(50, seed=1) test_df_float = test_df.astype("float32") for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]: @@ -357,7 +357,7 @@ def _test_productkde_logl_null_iter(variables, _df, _test_df): TEST_SIZE = 50 - test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) + test_df = generate_normal_data(TEST_SIZE, seed=1) test_df_float = test_df.astype("float32") np.random.seed(0) @@ -447,7 +447,7 @@ def _test_productkde_slogl_iter(variables, _df, _test_df): ) ) - test_df = util_test.generate_normal_data(50, seed=1) + test_df = generate_normal_data(50, seed=1) test_df_float = test_df.astype("float32") for variables in [["a"], ["b", "a"], ["c", 
"a", "b"], ["d", "a", "b", "c"]]: @@ -516,7 +516,7 @@ def _test_productkde_slogl_null_iter(variables, _df, _test_df): TEST_SIZE = 50 - test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) + test_df = generate_normal_data(TEST_SIZE, seed=1) test_df_float = test_df.astype("float32") np.random.seed(0) diff --git a/tests/factors/discrete/DiscreteFactor_test.py b/tests/factors/discrete/DiscreteFactor_test.py index 6f9ccc27..709fa925 100644 --- a/tests/factors/discrete/DiscreteFactor_test.py +++ b/tests/factors/discrete/DiscreteFactor_test.py @@ -2,7 +2,7 @@ import pandas as pd import pyarrow as pa import pytest -import util_test +from util_test import generate_normal_data import pybnesian as pbn diff --git a/tests/learning/algorithms/hillclimbing_test.py b/tests/learning/algorithms/hillclimbing_test.py index b62dc881..8e46928d 100644 --- a/tests/learning/algorithms/hillclimbing_test.py +++ b/tests/learning/algorithms/hillclimbing_test.py @@ -1,12 +1,12 @@ import numpy as np -import util_test +from util_test import generate_normal_data import pybnesian as pbn from pybnesian import BayesianNetwork, BayesianNetworkType -df = util_test.generate_normal_data(1000) +df = generate_normal_data(1000) # TODO: Add tests for normal data with dependencies -# dep_df = util_test.generate_normal_data_dep(1000) +# dep_df = generate_normal_data_dep(1000) def test_hc_estimate(): diff --git a/tests/learning/operators/operatorpool_test.py b/tests/learning/operators/operatorpool_test.py index 2b2b0709..570e2b34 100644 --- a/tests/learning/operators/operatorpool_test.py +++ b/tests/learning/operators/operatorpool_test.py @@ -1,10 +1,10 @@ import pytest -import util_test +from util_test import generate_normal_data import pybnesian as pbn SIZE = 10000 -df = util_test.generate_normal_data(SIZE) +df = generate_normal_data(SIZE) def test_create(): diff --git a/tests/learning/operators/operatorset_test.py b/tests/learning/operators/operatorset_test.py index 13ecfa66..db581dd6 100644 --- a/tests/learning/operators/operatorset_test.py +++ b/tests/learning/operators/operatorset_test.py @@ -1,11 +1,11 @@ import numpy as np import pytest -import util_test +from util_test import generate_normal_data import pybnesian as pbn SIZE = 10000 -df = util_test.generate_normal_data(SIZE) +df = generate_normal_data(SIZE) def test_create_change_node(): diff --git a/tests/learning/parameters/mle_test.py b/tests/learning/parameters/mle_test.py index 82985ee0..aa8a9031 100644 --- a/tests/learning/parameters/mle_test.py +++ b/tests/learning/parameters/mle_test.py @@ -1,11 +1,11 @@ import numpy as np import pytest -import util_test +from util_test import generate_normal_data import pybnesian as pbn SIZE = 10000 -df = util_test.generate_normal_data(SIZE) +df = generate_normal_data(SIZE) def numpy_fit_mle_lg(data, variable, evidence): diff --git a/tests/learning/scores/bic_test.py b/tests/learning/scores/bic_test.py index 7583232b..5c103013 100644 --- a/tests/learning/scores/bic_test.py +++ b/tests/learning/scores/bic_test.py @@ -1,12 +1,12 @@ import numpy as np -import util_test from scipy.stats import norm +from util_test import generate_normal_data import pybnesian as pbn SIZE = 10000 -df = util_test.generate_normal_data(SIZE) +df = generate_normal_data(SIZE) def numpy_local_score(data, variable, evidence): diff --git a/tests/learning/scores/cvlikelihood_test.py b/tests/learning/scores/cvlikelihood_test.py index 0bdc2c72..e8ac3b69 100644 --- a/tests/learning/scores/cvlikelihood_test.py +++ b/tests/learning/scores/cvlikelihood_test.py @@ 
-45,7 +45,7 @@ def numpy_local_score( ) if evidence: k_marg = gaussian_kde( - evidence_data.to_numpy().T, bw_method=k_joint.covariance_factor() + evidence_data.to_numpy().T, bw_method=k_joint.factor ) loglik += np.sum( k_joint.logpdf(test_node_data.to_numpy().T) diff --git a/tests/learning/scores/holdoutlikelihood_test.py b/tests/learning/scores/holdoutlikelihood_test.py index 9d7c5227..21447064 100644 --- a/tests/learning/scores/holdoutlikelihood_test.py +++ b/tests/learning/scores/holdoutlikelihood_test.py @@ -47,7 +47,7 @@ def numpy_local_score( ) if evidence: k_marg = gaussian_kde( - evidence_data.to_numpy().T, bw_method=k_joint.covariance_factor() + evidence_data.to_numpy().T, bw_method=k_joint.factor ) loglik = np.sum( k_joint.logpdf(test_node_data.to_numpy().T) diff --git a/tests/models/BayesianNetwork_test.py b/tests/models/BayesianNetwork_test.py index e5a6de0f..68767db3 100644 --- a/tests/models/BayesianNetwork_test.py +++ b/tests/models/BayesianNetwork_test.py @@ -1,11 +1,11 @@ import numpy as np import pytest -import util_test +from util_test import generate_normal_data import pybnesian as pbn from pybnesian import BayesianNetwork, GaussianNetwork -df = util_test.generate_normal_data(10000) +df = generate_normal_data(10000) def test_create_bn(): @@ -329,7 +329,7 @@ def test_bn_logl(): gbn.fit(df) - test_df = util_test.generate_normal_data(5000) + test_df = generate_normal_data(5000) ll = gbn.logl(test_df) sll = gbn.slogl(test_df) diff --git a/tests/models/BayesianNetwork_type_test.py b/tests/models/BayesianNetwork_type_test.py index 68e27ade..71ae83de 100644 --- a/tests/models/BayesianNetwork_type_test.py +++ b/tests/models/BayesianNetwork_type_test.py @@ -1,4 +1,4 @@ -import util_test +from util_test import generate_normal_data_indep import pybnesian as pbn from pybnesian import ( @@ -153,7 +153,7 @@ def test_new_specific_bn_type(): assert sp1.num_arcs() == sp3.num_arcs() == 0 assert sp2.arcs() == [("a", "b")] - df = util_test.generate_normal_data_indep(1000) + df = generate_normal_data_indep(1000) bic = pbn.BIC(df) start = SpecificNetwork(["a", "b", "c", "d"]) diff --git a/tests/models/DynamicBayesianNetwork_test.py b/tests/models/DynamicBayesianNetwork_test.py index 05304f59..3bcb0e62 100644 --- a/tests/models/DynamicBayesianNetwork_test.py +++ b/tests/models/DynamicBayesianNetwork_test.py @@ -3,8 +3,8 @@ import numpy as np import pandas as pd import pytest -import util_test from scipy.stats import norm +from util_test import generate_normal_data import pybnesian as pbn from pybnesian import ( @@ -13,7 +13,7 @@ GaussianNetwork, ) -df = util_test.generate_normal_data(1000) +df = generate_normal_data(1000) def test_create_dbn(): @@ -201,8 +201,8 @@ def test_logl_dbn(): gbn.fit(df) - test_df = util_test.generate_normal_data(100) - ground_truth_ll = numpy_logl(gbn, util_test.generate_normal_data(100)) + test_df = generate_normal_data(100) + ground_truth_ll = numpy_logl(gbn, generate_normal_data(100)) ll = gbn.logl(test_df) assert np.all(np.isclose(ground_truth_ll, ll)) @@ -237,6 +237,6 @@ def test_slogl_dbn(): transition_bn.add_arc("d_t_1", "d_t_0") gbn.fit(df) - test_df = util_test.generate_normal_data(100) + test_df = generate_normal_data(100) ll = numpy_logl(gbn, test_df) assert np.isclose(gbn.slogl(test_df), ll.sum()) diff --git a/tests/models/SemiparametricBN_test.py b/tests/models/SemiparametricBN_test.py index 413db9d3..683f976b 100644 --- a/tests/models/SemiparametricBN_test.py +++ b/tests/models/SemiparametricBN_test.py @@ -1,11 +1,11 @@ import numpy as np import pytest 
-import util_test +from util_test import generate_normal_data import pybnesian as pbn from pybnesian import CKDE, LinearGaussianCPD, SemiparametricBN -df = util_test.generate_normal_data(10000) +df = generate_normal_data(10000) def test_create_spbn(): @@ -253,7 +253,7 @@ def test_logl(): spbn.fit(df) - test_df = util_test.generate_normal_data(5000) + test_df = generate_normal_data(5000) ll = spbn.logl(test_df) sll = spbn.slogl(test_df) diff --git a/tests/serialization/serialize_models_test.py b/tests/serialization/serialize_models_test.py index 8a882549..07ed37d9 100644 --- a/tests/serialization/serialize_models_test.py +++ b/tests/serialization/serialize_models_test.py @@ -2,7 +2,7 @@ import pyarrow as pa import pytest -import util_test +from util_test import generate_discrete_data_dependent, generate_normal_data_indep import pybnesian as pbn from pybnesian import ( @@ -265,11 +265,11 @@ def other_fit_bytes(): cpd_a = LinearGaussianCPD("a", [], [0], 0.5) cpd_b = LinearGaussianCPD("b", ["a"], [1, 2], 2) - df_continuous = util_test.generate_normal_data_indep(100) + df_continuous = generate_normal_data_indep(100) cpd_c = CKDE("c", []) cpd_c.fit(df_continuous) - df_discrete = util_test.generate_discrete_data_dependent(100) + df_discrete = generate_discrete_data_dependent(100) df_discrete.columns = df_discrete.columns.str.lower() cpd_d = DiscreteFactor("d", []) cpd_d.fit(df_discrete) @@ -583,10 +583,10 @@ def cond_other_fit_bytes(): cpd_c = CKDE("c", ["a"]) cpd_d = DiscreteFactor("d", []) - df_continuous = util_test.generate_normal_data_indep(100) + df_continuous = generate_normal_data_indep(100) cpd_c.fit(df_continuous) - df_discrete = util_test.generate_discrete_data_dependent(100) + df_discrete = generate_discrete_data_dependent(100) df_discrete.columns = df_discrete.columns.str.lower() cpd_d = DiscreteFactor("d", []) cpd_d.fit(df_discrete) @@ -838,7 +838,7 @@ def dyn_gaussian_fit_bytes(): gaussian = pbn.DynamicGaussianNetwork(["a", "b", "c", "d"], 2) gaussian.static_bn().add_arc("a_t_2", "d_t_1") gaussian.transition_bn().add_arc("c_t_2", "b_t_0") - df = util_test.generate_normal_data_indep(1000) + df = generate_normal_data_indep(1000) gaussian.fit(df) gaussian.include_cpd = True return pickle.dumps(gaussian) @@ -917,8 +917,8 @@ def dyn_other_fit_bytes(): assert other_static.type() == other_transition.type() dyn_other = DynamicOtherBN(variables, 2, other_static, other_transition) - df_continuous = util_test.generate_normal_data_indep(1000) - df_discrete = util_test.generate_discrete_data_dependent(1000) + df_continuous = generate_normal_data_indep(1000) + df_discrete = generate_discrete_data_dependent(1000) df = df_continuous df["b"] = df_discrete["B"] dyn_other.fit(df) From 8dc996a0d80bbacfefb5c20d54d6eec87c36da8f Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Wed, 11 Sep 2024 14:26:52 +0200 Subject: [PATCH 08/75] dataset/ commented --- pybnesian/dataset/dataset.cpp | 23 +++++++++++++++++++++-- pybnesian/dataset/holdout_adaptator.hpp | 2 +- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/pybnesian/dataset/dataset.cpp b/pybnesian/dataset/dataset.cpp index 6df14d07..b3c3d613 100644 --- a/pybnesian/dataset/dataset.cpp +++ b/pybnesian/dataset/dataset.cpp @@ -197,6 +197,13 @@ std::vector DataFrame::column_names() const { return names; } +/** + * @brief Returns the total number of null elements across the given columns. + * + * @param begin Iterator to the first column (Arrow array). + * @param end Past-the-end iterator of the columns. + * @return int64_t Number of null elements.
+ */ int64_t null_count(Array_iterator begin, Array_iterator end) { int64_t r = 0; for (auto it = begin; it != end; it++) { @@ -204,7 +211,13 @@ int64_t null_count(Array_iterator begin, Array_iterator end) { return r; } - +/** + * @brief Returns the combined validity bitmap of the columns, or nullptr if no column contains null values. + * + * @param begin Iterator to the first column (Arrow array). + * @param end Past-the-end iterator of the columns. + * @return Buffer_ptr Combined bitmap. + */ Buffer_ptr combined_bitmap(Array_iterator begin, Array_iterator end) { if (null_count(begin, end) > 0) { Array_iterator first_null_col = end; @@ -233,7 +246,13 @@ Buffer_ptr combined_bitmap(Array_iterator begin, Array_iterator end) { return nullptr; } } - +/** + * @brief Returns the number of rows that are valid (non-null) in every column. + * + * @param begin Iterator to the first column (Arrow array). + * @param end Past-the-end iterator of the columns. + * @return int64_t Number of valid rows. + */ int64_t valid_rows(Array_iterator begin, Array_iterator end) { if (std::distance(begin, end) == 0) { return 0; diff --git a/pybnesian/dataset/holdout_adaptator.hpp b/pybnesian/dataset/holdout_adaptator.hpp index 9f11a39b..284d6178 100644 --- a/pybnesian/dataset/holdout_adaptator.hpp +++ b/pybnesian/dataset/holdout_adaptator.hpp @@ -49,7 +49,7 @@ class HoldOut { if (test_rows == 0 || train_rows == 0) { throw std::invalid_argument("Wrong test_ratio (" + std::to_string(test_ratio) + - "selected for HoldOut.\n" + ") selected for HoldOut.\n" "Generated train instances: " + std::to_string(train_rows) + "\n" From 00acbc4fc5f8b08082b3c06563e93dffc93baa69 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Wed, 11 Sep 2024 14:34:48 +0200 Subject: [PATCH 09/75] util commented --- pybnesian/util/basic_eigen_ops.hpp | 8 ++++++ pybnesian/util/progress.hpp | 8 ++++++ pybnesian/util/validate_options.cpp | 37 +++++++++++++++++++++----- pybnesian/util/validate_whitelists.hpp | 17 ++++++++++++ 4 files changed, 64 insertions(+), 6 deletions(-) diff --git a/pybnesian/util/basic_eigen_ops.hpp b/pybnesian/util/basic_eigen_ops.hpp index a1ac9874..99715918 100644 --- a/pybnesian/util/basic_eigen_ops.hpp +++ b/pybnesian/util/basic_eigen_ops.hpp @@ -132,6 +132,14 @@ Matrix sqrt_matrix(const M& m) { } // Checks whether M is positive definite. +/** + * @brief Checks whether a matrix M is positive definite. + * + * @tparam M Matrix type. + * @param m Matrix to check. + * @return true If M is positive definite. + * @return false If M is not positive definite. + */ template bool is_psd(const M& m) { using MatrixType = Matrix; diff --git a/pybnesian/util/progress.hpp b/pybnesian/util/progress.hpp index 31038d13..fb481c67 100644 --- a/pybnesian/util/progress.hpp +++ b/pybnesian/util/progress.hpp @@ -65,6 +65,14 @@ class IndeterminateSpinner : public BaseIndeterminateSpinner { indicators::ProgressSpinner m_spinner; }; +/** + * @brief Creates a spinner based on the verbose level. + * + * @tparam Args Types of the additional arguments forwarded to the spinner. + * @param verbose_level 0: no spinner; 1: indeterminate spinner. + * @param additional_args Additional arguments to pass to the spinner. + * @return std::unique_ptr Pointer to the spinner. + */ template std::unique_ptr indeterminate_spinner(int verbose_level, Args&&...
additional_args) { switch (verbose_level) { diff --git a/pybnesian/util/validate_options.cpp b/pybnesian/util/validate_options.cpp index f524b456..10eb89b6 100644 --- a/pybnesian/util/validate_options.cpp +++ b/pybnesian/util/validate_options.cpp @@ -13,13 +13,25 @@ using models::GaussianNetworkType, models::KDENetworkType, models::Semiparametri namespace util { +/** + * @brief Checks if the given score is valid for the given Bayesian network type, e.g., "bic", "bge", "cv-lik", + * "holdout-lik", "validated-lik". + * + * @param df DataFrame used to compute the score. + * @param bn_type Bayesian network type. + * @param score Optional score name. If not given, the default score for bn_type is used. + * @param seed Seed for the scores that need one. + * @param num_folds Number of folds for the cross-validated scores. + * @param test_holdout_ratio Holdout ratio for the holdout-based scores. + * @return std::unique_ptr The constructed score. + */ std::unique_ptr check_valid_score(const DataFrame& df, const BayesianNetworkType& bn_type, const std::optional& score, int seed, int num_folds, double test_holdout_ratio) { - if (score) { + if (score) { // If score is specified if (*score == "bic") return std::make_unique(df); if (*score == "bge") return std::make_unique(df); if (*score == "cv-lik") return std::make_unique(df, num_folds, seed); @@ -33,17 +45,30 @@ std::unique_ptr check_valid_score(const DataFrame& df, "\"bic\" (Bayesian Information Criterion), \"bge\" (Bayesian Gaussian equivalent), " "\"cv-lik\" (Cross-Validated likelihood), \"holdout-l\" (Hold-out likelihood) " " or \"validated-lik\" (Validated likelihood with cross-validation)."); - } else { + } else { // If score is not specified if (bn_type == GaussianNetworkType::get_ref()) { - return std::make_unique(df); + return std::make_unique(df); // Default score for GaussianNetworkType } else if (bn_type == SemiparametricBNType::get_ref() || bn_type == KDENetworkType::get_ref()) { - return std::make_unique(df, test_holdout_ratio, num_folds, seed); + return std::make_unique( + df, test_holdout_ratio, num_folds, seed); // Default score for SemiparametricBNType and KDENetworkType } else { throw std::invalid_argument("Default score not defined for " + bn_type.ToString() + "."); } } } +/** + * @brief Checks if the given operator names (["arcs", "node_type"]) are valid for the given Bayesian network type.
+ * If no operators are given, it returns the default operators for the Bayesian network type instead. + * + * @param bn_type Bayesian network type. + * @param operators Optional list of operator names. + * @param arc_blacklist List of forbidden arcs. + * @param arc_whitelist List of forced arcs. + * @param max_indegree Maximum indegree allowed in the graph. + * @param type_whitelist List of forced node types. + * @return std::shared_ptr The constructed operator set. + */ std::shared_ptr check_valid_operators(const BayesianNetworkType& bn_type, const std::optional>& operators, const ArcStringVector& arc_blacklist, @@ -52,7 +77,7 @@ std::shared_ptr check_valid_operators(const BayesianNetworkType& bn const FactorTypeVector& type_whitelist) { std::vector> res; - if (operators && !operators->empty()) { + if (operators && !operators->empty()) { // If operators are specified for (auto& op : *operators) { if (op == "arcs") { res.push_back(std::make_shared(arc_blacklist, arc_whitelist, max_indegree)); @@ -71,7 +96,7 @@ std::shared_ptr check_valid_operators(const BayesianNetworkType& bn "\"arcs\" (Changes in arcs; addition, removal and flip) or " "\"node_type\" (Change of node type)"); } - } else { + } else { // If operators are not specified if (bn_type == GaussianNetworkType::get_ref()) res.push_back(std::make_shared(arc_blacklist, arc_whitelist, max_indegree)); else if (bn_type == SemiparametricBNType::get_ref()) { diff --git a/pybnesian/util/validate_whitelists.hpp b/pybnesian/util/validate_whitelists.hpp index 81976e03..c06880b9 100644 --- a/pybnesian/util/validate_whitelists.hpp +++ b/pybnesian/util/validate_whitelists.hpp @@ -151,6 +151,15 @@ ListRestrictions validate_restrictions(const Model& g, return r; } +/** + * @brief Validates the arc restrictions for a model. + * + * @tparam Model Model type. + * @param g Model whose restrictions are validated. + * @param varc_blacklist List of forbidden arcs. + * @param varc_whitelist List of forced arcs. + * @return ListRestrictions The validated restrictions. + */ template ListRestrictions validate_restrictions(const Model& g, const ArcStringVector& varc_blacklist, @@ -181,6 +190,14 @@ ListRestrictions validate_restrictions(const Model& g, return r; } +/** + * @brief Validates the type restrictions for a model. + * + * @tparam Model Model type. + * @param g Model whose restrictions are validated. + * @param type_blacklist List of forbidden node types. + * @param type_whitelist List of forced node types. + */ template void validate_type_restrictions(const Model& g, const FactorTypeVector& type_blacklist, From c346b66ea2bae64de0c0f1d2eb8af360f335362d Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Wed, 11 Sep 2024 14:48:34 +0200 Subject: [PATCH 10/75] partial commenting --- pybnesian/factors/continuous/CKDE.cpp | 5 + pybnesian/factors/continuous/CKDE.hpp | 6 + pybnesian/factors/discrete/DiscreteFactor.cpp | 1 + pybnesian/kde/KDE.cpp | 12 ++ pybnesian/kde/KDE.hpp | 192 ++++++++++++++++-- pybnesian/kde/NormalReferenceRule.hpp | 66 ++++-- pybnesian/kde/ProductKDE.hpp | 5 +- pybnesian/kde/ScottsBandwidth.hpp | 49 ++++- pybnesian/learning/scores/cv_likelihood.cpp | 12 ++ pybnesian/learning/scores/cv_likelihood.hpp | 5 + .../learning/scores/holdout_likelihood.hpp | 5 + .../learning/scores/validated_likelihood.hpp | 30 ++- pybnesian/models/BayesianNetwork.hpp | 18 +- pybnesian/opencl/opencl_config.hpp | 13 ++ 14 files changed, 380 insertions(+), 39 deletions(-) diff --git a/pybnesian/factors/continuous/CKDE.cpp b/pybnesian/factors/continuous/CKDE.cpp index c0286c0e..3eabe5f4 100644 --- a/pybnesian/factors/continuous/CKDE.cpp +++ b/pybnesian/factors/continuous/CKDE.cpp @@ -40,6 +40,11 @@ std::shared_ptr CKDEType::new_factor(const ConditionalBayesianNetworkBas return generic_new_factor(variable, evidence, args, kwargs); } +/** + * @brief Public function to learn the CKDE parameters given the data. + * + * @param df Training data.
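+ *
+ * A sketch of the design (based on the members m_joint and m_marg shown below): fitting trains a joint KDE
+ * over (variable, evidence) and a marginal KDE over the evidence alone, and the conditional log-density is
+ * then obtained as the difference of their log-densities.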
+ */ void CKDE::fit(const DataFrame& df) { auto type = df.same_type(m_variables); diff --git a/pybnesian/factors/continuous/CKDE.hpp b/pybnesian/factors/continuous/CKDE.hpp index f179ab7d..053880e0 100644 --- a/pybnesian/factors/continuous/CKDE.hpp +++ b/pybnesian/factors/continuous/CKDE.hpp @@ -179,6 +179,12 @@ class CKDE : public Factor { KDE m_marg; }; +/** + * @brief Private function to learn the CKDE parameters given the data. + * + * @tparam ArrowType Arrow Data type. + * @param df Training data. + */ template void CKDE::_fit(const DataFrame& df) { m_joint.fit(df); diff --git a/pybnesian/factors/discrete/DiscreteFactor.cpp b/pybnesian/factors/discrete/DiscreteFactor.cpp index 35142b6d..b0584db4 100644 --- a/pybnesian/factors/discrete/DiscreteFactor.cpp +++ b/pybnesian/factors/discrete/DiscreteFactor.cpp @@ -210,6 +210,7 @@ Array_ptr DiscreteFactor::sample(int n, const DataFrame& evidence_values, unsign std::string DiscreteFactor::ToString() const { std::stringstream stream; stream << std::setprecision(3); + // The evidence variables are the parents of the node if (!evidence().empty()) { const auto& e = evidence(); stream << "[DiscreteFactor] P(" << variable() << " | " << e[0]; diff --git a/pybnesian/kde/KDE.cpp b/pybnesian/kde/KDE.cpp index fb6133fe..6271e3ab 100644 --- a/pybnesian/kde/KDE.cpp +++ b/pybnesian/kde/KDE.cpp @@ -67,6 +67,12 @@ void KDE::fit(const DataFrame& df) { m_fitted = true; } +/** + * @brief Public function to calculate the log-likelihood vector of the given data. + * + * @param df Data. + * @return VectorXd Log-likelihood vector. + */ VectorXd KDE::logl(const DataFrame& df) const { check_fitted(); auto type = df.same_type(m_variables); @@ -85,6 +91,12 @@ VectorXd KDE::logl(const DataFrame& df) const { } } +/** + * @brief Public function to calculate the log-likelihood sum of the given data. + * + * @param df Data. + * @return double Log-likelihood sum. + */ double KDE::slogl(const DataFrame& df) const { check_fitted(); auto type = df.same_type(m_variables); diff --git a/pybnesian/kde/KDE.hpp b/pybnesian/kde/KDE.hpp index 049078f7..0604e766 100644 --- a/pybnesian/kde/KDE.hpp +++ b/pybnesian/kde/KDE.hpp @@ -13,7 +13,27 @@ using opencl::OpenCLConfig, opencl::OpenCL_kernel_traits; namespace kde { +/** + * @brief Helper for evaluating a univariate kernel density estimate with OpenCL. + */ struct UnivariateKDE { + /** + * @brief Executes the log-likelihood calculation for a univariate KDE model. + * + * @tparam ArrowType Arrow data type. + * @param training_vec Training data. + * @param training_length Number of training instances. + * @param test_vec Test data. + * @param int Unused. + * @param test_offset Row offset into the test data buffer. + * @param test_length Number of test instances. + * @param int Unused. + * @param cholesky Cholesky decomposition of the bandwidth matrix. + * @param lognorm_const Log-normalization constant added to each kernel log-density. + * @param output_mat Output matrix. + */ template void static execute_logl_mat(const cl::Buffer& training_vec, const unsigned int training_length, @@ -26,6 +46,7 @@ struct UnivariateKDE { const typename ArrowType::c_type lognorm_const, cl::Buffer&, cl::Buffer& output_mat); + template static void execute_conditional_means(const cl::Buffer& joint_training, const cl::Buffer&, @@ -40,6 +61,21 @@ struct UnivariateKDE { cl::Buffer& output_mat); }; +/** + * @brief Implementation of the univariate log-likelihood computation declared above. + * + * @tparam ArrowType Arrow data type. + * @param training_vec Training data. + * @param training_length Number of training instances. + * @param test_vec Test data.
+ * @param int Unused. + * @param test_offset Row offset into the test data buffer. + * @param test_length Number of test instances. + * @param int Unused. + * @param cholesky Cholesky decomposition of the bandwidth matrix. + * @param lognorm_const Log-normalization constant added to each kernel log-density. + * @param output_mat Output matrix. + */ template void UnivariateKDE::execute_logl_mat(const cl::Buffer& training_vec, const unsigned int training_length, @@ -53,6 +89,22 @@ void UnivariateKDE::execute_logl_mat(const cl::Buffer& training_vec, cl::Buffer&, cl::Buffer& output_mat) { auto& opencl = OpenCLConfig::get(); + // For reference, the OpenCL kernel dispatched below (logl_values_1d_mat), which computes the + // log-likelihood value of each (train, test) pair: + // __kernel void logl_values_1d_mat_double(__global double *restrict train_vector, + // __private uint train_rows, + // __global double *restrict test_vector, + // __private uint test_offset, + // __constant double *standard_deviation, + // __private double lognorm_factor, + // __global double *restrict result) { + // int i = get_global_id(0); + // int train_idx = ROW(i, train_rows); + // int test_idx = COL(i, train_rows); + // double d = (train_vector[train_idx] - test_vector[test_offset + test_idx]) / standard_deviation[0]; + + // result[i] = (-0.5*d*d) + lognorm_factor; + // } auto& k_logl_values_1d_mat = opencl.kernel(OpenCL_kernel_traits::logl_values_1d_mat); k_logl_values_1d_mat.setArg(0, training_vec); k_logl_values_1d_mat.setArg(1, training_length); k_logl_values_1d_mat.setArg(3, test_offset); k_logl_values_1d_mat.setArg(4, cholesky); k_logl_values_1d_mat.setArg(5, lognorm_const); k_logl_values_1d_mat.setArg(6, output_mat); + auto& queue = opencl.queue(); + // Enqueues one work-item per (train, test) pair RAISE_ENQUEUEKERNEL_ERROR(queue.enqueueNDRangeKernel( k_logl_values_1d_mat, cl::NullRange, cl::NDRange(training_length * test_length), cl::NullRange)); } +// Computes the conditional means. template void UnivariateKDE::execute_conditional_means(const cl::Buffer& joint_training, const cl::Buffer&, @@ -88,6 +143,8 @@ void UnivariateKDE::execute_conditional_means(const cl::Buffer& joint_training, k_conditional_means_1d.setArg(5, transform_mean); k_conditional_means_1d.setArg(6, output_mat); auto& queue = opencl.queue(); + + // Enqueues the kernel to compute the conditional mean for each test instance RAISE_ENQUEUEKERNEL_ERROR(queue.enqueueNDRangeKernel( k_conditional_means_1d, cl::NullRange, cl::NDRange(training_rows * test_length), cl::NullRange)); } @@ -119,7 +176,22 @@ struct MultivariateKDE { cl::Buffer& tmp_mat, cl::Buffer& output_mat); }; - +/** + * @brief Executes the log-likelihood calculation for a multivariate KDE model. + * + * @tparam ArrowType Arrow data type. + * @param training_mat Training data. + * @param training_rows Number of training instances. + * @param test_mat Test data. + * @param test_physical_rows Number of rows physically stored in the test buffer. + * @param test_offset Row offset into the test data buffer. + * @param test_length Number of test instances to evaluate. + * @param matrices_cols Number of columns of the matrices. + * @param cholesky Cholesky decomposition of the bandwidth matrix. + * @param lognorm_const Log-normalization constant. + * @param tmp_mat Temporary matrix. + * @param output_mat Output matrix.
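+ *
+ * Implementation note (an editorial reading of the branch below): depending on whether there are more
+ * training rows or test rows, the subtract/solve/square pipeline is launched once per test row or once per
+ * training row, so the larger dimension is always processed inside the OpenCL kernels.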
+ */ template void MultivariateKDE::execute_logl_mat(const cl::Buffer& training_mat, const unsigned int training_rows, @@ -134,19 +206,55 @@ void MultivariateKDE::execute_logl_mat(const cl::Buffer& training_mat, cl::Buffer& output_mat) { auto& opencl = OpenCLConfig::get(); + // __kernel void substract_double(__global double* restrict training_matrix, + // __private uint training_physical_rows, + // __private uint training_offset, + // __private uint training_rows, + // __global double* restrict test_matrix, + // __private uint test_physical_rows, + // __private uint test_offset, + // __private uint test_row_idx, + // __global double* restrict res) { + // uint i = get_global_id(0); + // uint r = ROW(i, training_rows) + training_offset; + // uint c = COL(i, training_rows); + // res[i] = test_matrix[IDX(test_offset + test_row_idx, c, test_physical_rows)] - + // training_matrix[IDX(r, c, training_physical_rows)]; + // } auto& k_substract = opencl.kernel(OpenCL_kernel_traits::substract); + // __kernel void solve_double(__global double* restrict diff_matrix, + // __private uint diff_matrix_rows, + // __private uint matrices_cols, + // __global double* restrict cholesky_matrix) { + // uint r = get_global_id(0); + + // for (uint c = 0; c < matrices_cols; c++) { + // for (uint i = 0; i < c; i++) { + // diff_matrix[IDX(r, c, diff_matrix_rows)] -= + // cholesky_matrix[IDX(c, i, matrices_cols)] * diff_matrix[IDX(r, i, diff_matrix_rows)]; + // } + // diff_matrix[IDX(r, c, diff_matrix_rows)] /= cholesky_matrix[IDX(c, c, matrices_cols)]; + // } + // } auto& k_solve = opencl.kernel(OpenCL_kernel_traits::solve); k_solve.setArg(0, tmp_mat); k_solve.setArg(2, matrices_cols); k_solve.setArg(3, cholesky); + // __kernel void square_double(__global double* restrict m) { + // uint idx = get_global_id(0); + // double d = m[idx]; + // m[idx] = d * d; + // } auto& k_square = opencl.kernel(OpenCL_kernel_traits::square); k_square.setArg(0, tmp_mat); auto& queue = opencl.queue(); - if (training_rows > test_length) { + if (training_rows > + test_length) { // When the number of training instances is greater than the number of test instances + // Test Matrix - Training Matrix k_substract.setArg(0, training_mat); k_substract.setArg(1, training_rows); k_substract.setArg(2, 0u); @@ -165,14 +273,18 @@ void MultivariateKDE::execute_logl_mat(const cl::Buffer& training_mat, k_logl_values_mat.setArg(3, training_rows); k_logl_values_mat.setArg(5, lognorm_const); + // NOTE: Computes the kernel log-densities, iterating over the test instances for (unsigned int i = 0; i < test_length; ++i) { k_substract.setArg(7, i); RAISE_ENQUEUEKERNEL_ERROR(queue.enqueueNDRangeKernel( k_substract, cl::NullRange, cl::NDRange(training_rows * matrices_cols), cl::NullRange)); + RAISE_ENQUEUEKERNEL_ERROR( queue.enqueueNDRangeKernel(k_solve, cl::NullRange, cl::NDRange(training_rows), cl::NullRange)); + RAISE_ENQUEUEKERNEL_ERROR(queue.enqueueNDRangeKernel( k_square, cl::NullRange, cl::NDRange(training_rows * matrices_cols), cl::NullRange)); + k_logl_values_mat.setArg(4, i); RAISE_ENQUEUEKERNEL_ERROR(queue.enqueueNDRangeKernel( k_logl_values_mat, cl::NullRange, cl::NDRange(training_rows), cl::NullRange)); @@ -196,6 +308,7 @@ void MultivariateKDE::execute_logl_mat(const cl::Buffer& training_mat, k_logl_values_mat.setArg(3, training_rows); k_logl_values_mat.setArg(5, lognorm_const); + // Computes the kernel log-densities, iterating over the training instances for (unsigned int i = 0; i < training_rows; ++i) { k_substract.setArg(7, i); RAISE_ENQUEUEKERNEL_ERROR(queue.enqueueNDRangeKernel( @@ -210,7 +323,7 @@ } } } - +// Computes the conditional means. template void MultivariateKDE::execute_conditional_means(const cl::Buffer& joint_training, const cl::Buffer& marg_training, @@ -447,7 +560,14 @@ DataFrame KDE::_training_data() const { auto rb = arrow::RecordBatch::Make(schema, N, columns); return DataFrame(rb); } - +/** + * @brief Private function to learn the KDE parameters given the training data. + * Used in the public function fit in KDE.cpp. + * + * @tparam ArrowType Arrow data type. + * @tparam contains_null Boolean indicating if the training data contains null values. + * @param df Training data. + */ template void KDE::_fit(const DataFrame& df) { using CType = typename ArrowType::c_type; @@ -457,15 +577,15 @@ void KDE::_fit(const DataFrame& df) { m_bandwidth = m_bselector->bandwidth(df, m_variables); auto llt_cov = m_bandwidth.llt(); - auto llt_matrix = llt_cov.matrixLLT(); + auto cholesky = llt_cov.matrixLLT(); auto& opencl = OpenCLConfig::get(); if constexpr (std::is_same_v) { - m_H_cholesky = opencl.copy_to_buffer(llt_matrix.data(), d * d); + m_H_cholesky = opencl.copy_to_buffer(cholesky.data(), d * d); } else { using MatrixType = Matrix; - MatrixType casted_cholesky = llt_matrix.template cast(); + MatrixType casted_cholesky = cholesky.template cast(); m_H_cholesky = opencl.copy_to_buffer(casted_cholesky.data(), d * d); } auto training_data = _training_data(); N = training_data->rows(); m_training = opencl.copy_to_buffer(training_data->data(), N * d); - m_lognorm_const = - -llt_matrix.diagonal().array().log().sum() - 0.5 * d * std::log(2 * util::pi) - std::log(N); + // NOTE: det(H) is the squared product of the diagonal of its Cholesky factor L, so sum(log(diag(L))) = 0.5 * log(det(H)) + // m_lognorm_const = -0.5 * log(det(H)) - 1/2 * d * log(2 * pi) - log(N) + m_lognorm_const = -cholesky.diagonal().array().log().sum() - 0.5 * d * std::log(2 * util::pi) - std::log(N); } +/** + * @brief Learns the KDE parameters given the bandwidth matrix, the training data, the training data type and the number + * of training instances. + * + * @tparam ArrowType Arrow data type. + * @tparam EigenMatrix Eigen matrix type. + * @param bandwidth Bandwidth matrix. + * @param training_data Training data. + * @param training_type Arrow data type of the training data. + * @param training_instances Number of training instances. + */ template void KDE::fit(EigenMatrix bandwidth, cl::Buffer training_data, @@ -506,10 +638,19 @@ void KDE::fit(EigenMatrix bandwidth, m_training = training_data; m_training_type = training_type; N = training_instances; + + // NOTE: As above, sum(log(diag(L))) equals 0.5 * log(det(H)) for the Cholesky factor L m_lognorm_const = -cholesky.diagonal().array().log().sum() - 0.5 * d * std::log(2 * util::pi) - std::log(N); m_fitted = true; } +/** + * @brief Calculates the log-likelihood of the given data with OpenCL. + * + * @tparam ArrowType Arrow data type. + * @param df Data. + * @return VectorXd Log-likelihood values.
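+ *
+ * A sketch of the quantity computed (matching _logl_impl and m_lognorm_const below):
+ *   logl(x) = logsumexp_i( -0.5 * ||L^(-1) (x - x_i)||^2 + m_lognorm_const ),
+ * where L is the Cholesky factor of the bandwidth matrix H and
+ * m_lognorm_const = -sum_j(log(L_jj)) - (d/2) * log(2 * pi) - log(N), so the result equals
+ * log( (1/N) * sum_i K_H(x - x_i) ).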
+ */ template VectorXd KDE::_logl(const DataFrame& df) const { using CType = typename ArrowType::c_type; @@ -517,14 +658,15 @@ VectorXd KDE::_logl(const DataFrame& df) const { auto logl_buff = logl_buffer(df); auto& opencl = OpenCLConfig::get(); - if (df.null_count(m_variables) == 0) { + // Reads the per-instance log-likelihood values back from the device (see _logl_impl below) + if (df.null_count(m_variables) == 0) { // No missing values: read the whole result buffer VectorType read_data(df->num_rows()); opencl.read_from_buffer(read_data.data(), logl_buff, df->num_rows()); if constexpr (!std::is_same_v) return read_data.template cast(); else return read_data; - } else { + } else { // Missing values: read only the rows that are valid in every variable auto m = df.valid_rows(m_variables); VectorType read_data(m); auto bitmap = df.combined_bitmap(m_variables); @@ -560,7 +702,13 @@ double KDE::_slogl(const DataFrame& df) const { opencl.read_from_buffer(&result, buffer_sum, 1); return static_cast(result); } - +/** + * @brief Calculates the log-likelihood of the given data using _logl_impl. + * + * @tparam ArrowType Arrow data type. + * @param df Data. + * @return cl::Buffer Log-likelihood values. + */ template cl::Buffer KDE::logl_buffer(const DataFrame& df) const { auto& opencl = OpenCLConfig::get(); @@ -575,6 +723,14 @@ cl::Buffer KDE::logl_buffer(const DataFrame& df) const { return _logl_impl(test_buffer, m); } +/** + * @brief Calculates the log-likelihood of the given data using _logl_impl. + * + * @tparam ArrowType Arrow data type. + * @param df Data. + * @param bitmap Combined validity bitmap of the variables. + * @return cl::Buffer Log-likelihood values. + */ template cl::Buffer KDE::logl_buffer(const DataFrame& df, Buffer_ptr& bitmap) const { auto& opencl = OpenCLConfig::get(); @@ -589,6 +745,16 @@ cl::Buffer KDE::logl_buffer(const DataFrame& df, Buffer_ptr& bitmap) const { return _logl_impl(test_buffer, m); } +/** + * @brief Computes the log-likelihood of the test instances with OpenCL, processing the test data in batches. + * + * @tparam ArrowType Arrow data type. + * @tparam KDEType KDE type. + * @param test_buffer Test data. + * @param m Number of test instances. + * @return cl::Buffer Log-likelihood values. + */ template cl::Buffer KDE::_logl_impl(cl::Buffer& test_buffer, int m) const { using CType = typename ArrowType::c_type; auto d = m_variables.size(); auto& opencl = OpenCLConfig::get(); ... @@ -619,6 +785,7 @@ cl::Buffer KDE::_logl_impl(cl::Buffer& test_buffer, int m) const { m_lognorm_const, tmp_mat_buffer, mat_logls); + // Reduces each column with logsumexp to obtain the log-likelihood of each test instance opencl.logsumexp_cols_offset(mat_logls, N, allocated_m, res, i * allocated_m); } auto remaining_m = m - (iterations - 1) * allocated_m; @@ -634,6 +801,7 @@ m_lognorm_const, tmp_mat_buffer, mat_logls); + // Reduces each column with logsumexp to obtain the log-likelihood of each test instance opencl.logsumexp_cols_offset(mat_logls, N, remaining_m, res, (iterations - 1) * allocated_m); return res; diff --git a/pybnesian/kde/NormalReferenceRule.hpp b/pybnesian/kde/NormalReferenceRule.hpp index 2da850b7..7b93a213 100644 --- a/pybnesian/kde/NormalReferenceRule.hpp +++ b/pybnesian/kde/NormalReferenceRule.hpp @@ -9,13 +9,22 @@ namespace kde { class NormalReferenceRule : public BandwidthSelector { public: + /** + * @brief Public function for calculating the diagonal bandwidth matrix using the Normal Reference Rule given the + * data and variables. + * + * @param df Dataframe. + * @param variables Variables. + * @return VectorXd Diagonal bandwidth vector.
+ */ VectorXd diag_bandwidth(const DataFrame& df, const std::vector& variables) const override { if (variables.empty()) return VectorXd(0); size_t valid_rows = df.valid_rows(variables); - if (valid_rows <= variables.size()) { + if (valid_rows <= variables.size()) { // If the number of (valid) rows is less than the number of variables std::stringstream ss; - ss << "Diagonal bandwidth matrix of " << std::to_string(variables.size()) << " variables [" << variables[0]; + ss << "NormalReferenceRule::diag_bandwidth -> Diagonal bandwidth matrix of " + << std::to_string(variables.size()) << " variables [" << variables[0]; for (size_t i = 1; i < variables.size(); ++i) { ss << ", " << variables[i]; } @@ -30,17 +39,28 @@ class NormalReferenceRule : public BandwidthSelector { case Type::FLOAT: return diag_bandwidth(df, variables); default: - throw std::invalid_argument("Wrong data type to fit bandwidth. [double] or [float] data is expected."); + throw std::invalid_argument( + "NormalReferenceRule::diag_bandwidth -> Wrong data type to fit bandwidth. [double] or [float] data " + "is expected."); } } - + /** + * @brief Public function for calculating the bandwidth matrix using the Normal Reference Rule given the data and + * variables. + * + * @param df Data + * @param variables Variables. + * @return MatrixXd Bandwidth matrix. + */ MatrixXd bandwidth(const DataFrame& df, const std::vector& variables) const override { if (variables.empty()) return MatrixXd(0, 0); auto valid_rows = df.valid_rows(variables); - if (static_cast(valid_rows) <= variables.size()) { + if (static_cast(valid_rows) <= + variables.size()) { // If the number of (valid) rows is less than the number of variables std::stringstream ss; - ss << "Bandwidth matrix of " << std::to_string(variables.size()) << " variables [" << variables[0]; + ss << "NormalReferenceRule::bandwidth -> Bandwidth matrix of " << std::to_string(variables.size()) + << " variables [" << variables[0]; for (size_t i = 1; i < variables.size(); ++i) { ss << ", " << variables[i]; } @@ -50,12 +70,15 @@ class NormalReferenceRule : public BandwidthSelector { } switch (df.same_type(variables)->id()) { + // Here the bandwidth is calculated using the function defined later in the private section. case Type::DOUBLE: return bandwidth(df, variables); case Type::FLOAT: return bandwidth(df, variables); default: - throw std::invalid_argument("Wrong data type to fit bandwidth. [double] or [float] data is expected."); + throw std::invalid_argument( + "NormalReferenceRule::bandwidth -> Wrong data type to fit bandwidth. [double] or [float] data is " + "expected."); } } @@ -68,6 +91,15 @@ class NormalReferenceRule : public BandwidthSelector { } private: + /** + * @brief Private function to calculate the diagonal bandwidth matrix using the Normal Reference Rule given the data + * and variables. If the covariance matrix is not positive definite, an exception is thrown. + * + * @tparam ArrowType Arrow Data type. + * @param df Dataframe. + * @param variables Variables. + * @return VectorXd Diagonal bandwidth vector. 
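+ *
+ * A sketch of the estimate computed below (Equation (3.4) of Chacon and Duong (2018)):
+ *   h_j = (k / N)^(2/(d+4)) * Cov_jj,
+ *   k = 4 * d * sqrt(det(delta)) / (2 * tr(delta^(-1) * delta^(-1)) + tr(delta^(-1))^2),
+ * where delta = diag(Cov)^(-1) * Cov, N is the number of valid rows and d the number of variables.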
+ */ template VectorXd diag_bandwidth(const DataFrame& df, const std::vector& variables) const { using CType = typename ArrowType::c_type; @@ -77,16 +109,16 @@ class NormalReferenceRule : public BandwidthSelector { if (!util::is_psd(cov)) { std::stringstream ss; - ss << "Covariance matrix for variables [" << variables[0]; + ss << "NormalReferenceRule::diag_bandwidth -> Covariance matrix for variables [" << variables[0]; for (size_t i = 1; i < variables.size(); ++i) { ss << ", " << variables[i]; } ss << "] is not positive-definite."; throw util::singular_covariance_data(ss.str()); } - + // The covariance diagonal is used to calculate the bandwidth auto diag = cov.diagonal(); - auto delta = (cov.array().colwise() * diag.cwiseInverse().array()).matrix(); + auto delta = (cov.array().colwise() * diag.cwiseInverse().array()).matrix(); // delta = diag(cov)^(-1) * cov auto delta_inv = delta.inverse(); auto N = static_cast(df.valid_rows(variables)); @@ -94,7 +126,9 @@ class NormalReferenceRule : public BandwidthSelector { auto delta_inv_trace = delta_inv.trace(); - // Estimate bandwidth using Equation (3.4) of Chacon and Duong (2018) + // NOTE: Estimate bandwidth using Equation (3.4) of Chacon and Duong (2018): + // k = 4*d*sqrt(det(delta)) / (2*trace(delta^(-1)*delta^(-1)) + trace(delta^(-1))^2); + // the final bandwidth below is (k / N)^(2/(d+4)) * diag(cov) auto k = 4 * d * std::sqrt(delta.determinant()) / (2 * (delta_inv * delta_inv).trace() + delta_inv_trace * delta_inv_trace); @@ -104,7 +138,15 @@ class NormalReferenceRule : public BandwidthSelector { return (std::pow(k / N, 2. / (d + 4.)) * diag).template cast(); } } - + /** + * @brief Private function to calculate the bandwidth matrix using the Normal Reference Rule given the data and + * variables. If the covariance matrix is not positive definite, an exception is thrown. + * + * @tparam ArrowType Arrow Data type. + * @param df Dataframe. + * @param variables Variables. + * @return MatrixXd Bandwidth matrix. + */ template MatrixXd bandwidth(const DataFrame& df, const std::vector& variables) const { using CType = typename ArrowType::c_type; diff --git a/pybnesian/kde/ProductKDE.hpp b/pybnesian/kde/ProductKDE.hpp index ca211beb..f666747e 100644 --- a/pybnesian/kde/ProductKDE.hpp +++ b/pybnesian/kde/ProductKDE.hpp @@ -1,12 +1,13 @@ #ifndef PYBNESIAN_KDE_PRODUCTKDE_HPP #define PYBNESIAN_KDE_PRODUCTKDE_HPP -#include #include #include #include #include +#include + using opencl::OpenCLConfig, opencl::OpenCL_kernel_traits; namespace kde { @@ -184,7 +185,7 @@ void ProductKDE::_fit(const DataFrame& df) { m_training.push_back(opencl.copy_to_buffer(column->data(), N)); } } - + // -1/2 * d * log(2 * pi) - 1/2 * log(|h|) - log(N) m_lognorm_const = -0.5 * static_cast(m_variables.size()) * std::log(2 * util::pi) - 0.5 * m_bandwidth.array().log().sum() - std::log(N); } diff --git a/pybnesian/kde/ScottsBandwidth.hpp b/pybnesian/kde/ScottsBandwidth.hpp index fa30604d..bcae2189 100644 --- a/pybnesian/kde/ScottsBandwidth.hpp +++ b/pybnesian/kde/ScottsBandwidth.hpp @@ -5,13 +5,22 @@ namespace kde { class ScottsBandwidth : public BandwidthSelector { public: + /** + * @brief Public function for calculating the diagonal bandwidth matrix using Scott's Rule given the data and + * variables. + * + * @param df Dataframe. + * @param variables Variables. + * @return VectorXd Diagonal bandwidth vector.
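+ *
+ * A sketch of the per-variable rule (Scott's rule) applied by the private implementation below:
+ *   h_j = N^(-2/(d+4)) * Cov_jj,
+ * where N is the number of valid rows and d the number of variables.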
+ */ VectorXd diag_bandwidth(const DataFrame& df, const std::vector& variables) const override { if (variables.empty()) return VectorXd(0); size_t valid_rows = df.valid_rows(variables); if (valid_rows <= 1) { std::stringstream ss; - ss << "Diagonal bandwidth matrix of " << std::to_string(variables.size()) << " variables [" << variables[0]; + ss << "ScottsBandwidth::diag_bandwidth -> Diagonal bandwidth matrix of " << std::to_string(variables.size()) + << " variables [" << variables[0]; for (size_t i = 1; i < variables.size(); ++i) { ss << ", " << variables[i]; } @@ -26,17 +35,26 @@ class ScottsBandwidth : public BandwidthSelector { case Type::FLOAT: return diag_bandwidth(df, variables); default: - throw std::invalid_argument("Wrong data type to fit bandwidth. [double] or [float] data is expected."); + throw std::invalid_argument( + "ScottsBandwidth::diag_bandwidth -> Wrong data type to fit bandwidth. [double] or [float] data is " + "expected."); } } - + /** + * @brief Public function for calculating the bandwidth matrix using Scott's Rule given the data and variables. + * + * @param df Data + * @param variables Variables. + * @return MatrixXd Bandwidth matrix. + */ MatrixXd bandwidth(const DataFrame& df, const std::vector& variables) const override { if (variables.empty()) return MatrixXd(0, 0); size_t valid_rows = df.valid_rows(variables); if (valid_rows <= variables.size()) { std::stringstream ss; - ss << "Bandwidth matrix of " << std::to_string(variables.size()) << " variables [" << variables[0]; + ss << "ScottsBandwidth::bandwidth -> Bandwidth matrix of " << std::to_string(variables.size()) + << " variables [" << variables[0]; for (size_t i = 1; i < variables.size(); ++i) { ss << ", " << variables[i]; } @@ -51,7 +69,9 @@ class ScottsBandwidth : public BandwidthSelector { case Type::FLOAT: return bandwidth(df, variables); default: - throw std::invalid_argument("Wrong data type to fit bandwidth. [double] or [float] data is expected."); + throw std::invalid_argument( + "ScottsBandwidth::bandwidth -> Wrong data type to fit bandwidth. [double] or [float] data is " + "expected."); } } @@ -62,6 +82,15 @@ class ScottsBandwidth : public BandwidthSelector { static std::shared_ptr __setstate__(py::tuple&) { return std::make_shared(); } private: + /** + * @brief Private function for calculating the diagonal bandwidth matrix using Scott's Rule given the data and + * variables + * + * @tparam ArrowType Arrow Data type. + * @param df Dataframe. + * @param variables Variables. + * @return VectorXd Diagonal bandwidth vector. + */ template VectorXd diag_bandwidth(const DataFrame& df, const std::vector& variables) const { using CType = typename ArrowType::c_type; @@ -86,7 +115,14 @@ class ScottsBandwidth : public BandwidthSelector { return bandwidth; } - + /** + * @brief Private function for calculating the bandwidth matrix using Scott's Rule given the data and variables. + * + * @tparam ArrowType Arrow Data type. + * @param df Dataframe. + * @param variables Variables. + * @return MatrixXd Bandwidth matrix. + */ template MatrixXd bandwidth(const DataFrame& df, const std::vector& variables) const { using CType = typename ArrowType::c_type; @@ -106,6 +142,7 @@ class ScottsBandwidth : public BandwidthSelector { auto N = static_cast(df.valid_rows(variables)); auto d = static_cast(variables.size()); + // Scott's Rule formula auto k = std::pow(N, -2. 
/ (d + 4)); if constexpr (std::is_same_v) { diff --git a/pybnesian/learning/scores/cv_likelihood.cpp b/pybnesian/learning/scores/cv_likelihood.cpp index 19344f6c..b7086f3c 100644 --- a/pybnesian/learning/scores/cv_likelihood.cpp +++ b/pybnesian/learning/scores/cv_likelihood.cpp @@ -8,15 +8,27 @@ double CVLikelihood::local_score(const BayesianNetworkBase& model, return local_score(model, model.underlying_node_type(m_cv.data(), variable), variable, evidence); } +/** + * @brief Calculates the local score (cross-validated log-likelihood) of a variable given its evidence. + * NOTE: Requires fitting a CPD for each fold. + * @param model Bayesian network model. + * @param variable_type Factor type of the variable. + * @param variable Variable name. + * @param evidence Evidence (parent) variables. + * @return double The cross-validated log-likelihood. + */ double CVLikelihood::local_score(const BayesianNetworkBase& model, const std::shared_ptr& variable_type, const std::string& variable, const std::vector& evidence) const { auto [args, kwargs] = m_arguments.args(variable, variable_type); + // Initialize the CPD auto cpd = variable_type->new_factor(model, variable, evidence, args, kwargs); + // Calculates the log-likelihood for each fold double loglik = 0; for (auto [train_df, test_df] : m_cv.loc(variable, evidence)) { + // NOTE: This fit can fail if the covariance matrix is not positive definite cpd->fit(train_df); loglik += cpd->slogl(test_df); } diff --git a/pybnesian/learning/scores/cv_likelihood.hpp b/pybnesian/learning/scores/cv_likelihood.hpp index 6b481e73..87d6d043 100644 --- a/pybnesian/learning/scores/cv_likelihood.hpp +++ b/pybnesian/learning/scores/cv_likelihood.hpp @@ -11,6 +11,11 @@ using models::BayesianNetworkBase, models::BayesianNetworkType; namespace learning::scores { +/** + * @brief This class implements an estimation of the log-likelihood on unseen data using k-fold cross-validation over + * the data. + * + */ class CVLikelihood : public Score { public: CVLikelihood(const DataFrame& df, diff --git a/pybnesian/learning/scores/holdout_likelihood.hpp b/pybnesian/learning/scores/holdout_likelihood.hpp index c7b9dfa6..8ddd1d8a 100644 --- a/pybnesian/learning/scores/holdout_likelihood.hpp +++ b/pybnesian/learning/scores/holdout_likelihood.hpp @@ -12,6 +12,11 @@ using models::GaussianNetwork, models::SemiparametricBN; namespace learning::scores { +/** + * @brief This class implements an estimation of the log-likelihood on unseen data using a holdout dataset. Thus, the + * parameters are estimated using the training data, and the score is estimated on the holdout data. + * + */ class HoldoutLikelihood : public Score { public: HoldoutLikelihood(const DataFrame& df, diff --git a/pybnesian/learning/scores/validated_likelihood.hpp b/pybnesian/learning/scores/validated_likelihood.hpp index 4bb98d45..e7dd7456 100644 --- a/pybnesian/learning/scores/validated_likelihood.hpp +++ b/pybnesian/learning/scores/validated_likelihood.hpp @@ -8,7 +8,15 @@ using learning::scores::ValidatedScore, learning::scores::HoldoutLikelihood, learning::scores::CVLikelihood; namespace learning::scores { - +/** + * @brief This class mixes the functionality of CVLikelihood and HoldoutLikelihood. First, it applies a HoldOut split over the data. Then: - It estimates the training score using a CVLikelihood over the training data. - It estimates the validation score using the training data to estimate the parameters and computing the log-likelihood on the holdout data.
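+ *
+ * A minimal usage sketch from Python (parameter names are illustrative and not checked against the bindings):
+ *   vl = pbn.ValidatedLikelihood(df, test_ratio=0.2, k=10)
+ *   vl.local_score(model, "a", ["b"])   # CV log-likelihood over the training split
+ *   vl.vlocal_score(model, "a", ["b"])  # log-likelihood over the holdout split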
+ + * + */ class ValidatedLikelihood : public ValidatedScore { public: ValidatedLikelihood(const DataFrame& df, @@ -26,7 +34,15 @@ class ValidatedLikelihood : public ValidatedScore { const std::vector& parents) const override { return m_cv.local_score(model, variable, parents); } - + /** + * @brief Calculates the cross-validated log-likelihood of a variable given its parents. + * + * @param model Bayesian network model. + * @param variable_type Factor type of the variable. + * @param variable Variable name. + * @param parents Parent variables. + * @return double The cross-validated log-likelihood. + */ double local_score(const BayesianNetworkBase& model, const std::shared_ptr& variable_type, const std::string& variable, @@ -51,7 +67,15 @@ class ValidatedLikelihood : public ValidatedScore { const std::vector& evidence) const override { return m_holdout.local_score(model, variable, evidence); } - + /** + * @brief Calculates the validated local score of a variable given the evidence. + * + * @param model Bayesian network model. + * @param variable_type Factor type of the variable. + * @param variable Variable name. + * @param evidence Evidence (parent) variables. + * @return double The validated local score. + */ double vlocal_score(const BayesianNetworkBase& model, const std::shared_ptr& variable_type, const std::string& variable, diff --git a/pybnesian/models/BayesianNetwork.hpp b/pybnesian/models/BayesianNetwork.hpp index 830df3b6..8f9b492e 100644 --- a/pybnesian/models/BayesianNetwork.hpp +++ b/pybnesian/models/BayesianNetwork.hpp @@ -62,6 +62,12 @@ class BayesianNetworkBase : public std::enable_shared_from_thiscan_have_arc(*this, target, source); } - + /** + * @brief Includes the given whitelisted arcs. It checks the validity of the graph after including the arc whitelist. + * + * @param arc_whitelist List of arcs to add. + */ void force_whitelist(const ArcStringVector& arc_whitelist) override { for (const auto& arc : arc_whitelist) { if (!has_arc(arc.first, arc.second)) { - if (has_arc(arc.second, arc.first)) { + if (has_arc(arc.second, arc.first)) { // Check if the reverse arc is present throw std::invalid_argument("Arc " + arc.first + " -> " + arc.second + " in whitelist," " but arc " + arc.second + " -> " + arc.first + " is present" " in the Bayesian Network."); - } else if (can_add_arc(arc.first, arc.second)) { + } else if (can_add_arc(arc.first, arc.second)) { // Check if the arc can be added add_arc_unsafe(arc.first, arc.second); - } else { + } else { // Otherwise, the arc is not allowed in this network type throw std::invalid_argument("Arc " + arc.first + " -> " + arc.second + " not allowed in this Bayesian network."); } diff --git a/pybnesian/opencl/opencl_config.hpp b/pybnesian/opencl/opencl_config.hpp index 565b7880..220818c9 100644 --- a/pybnesian/opencl/opencl_config.hpp +++ b/pybnesian/opencl/opencl_config.hpp @@ -515,18 +515,31 @@ void OpenCLConfig::reduction_cols_offset( } template +/** + * @brief Calculates the log(sum(exp(.))) of each column of a matrix.
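+ * It applies the standard max-shift for numerical stability (see the kernel comments below):
+ *   log(sum_i(exp(x_i))) = m + log(sum_i(exp(x_i - m))), with m = max_i(x_i).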
+ * + * @param input_mat Matrix of size input_rows x input_cols + * @param input_rows Number of rows of the matrix + * @param input_cols Number of columns of the matrix + * @param output_vec Vector of size input_cols + * @param output_offset Offset of the output vector + */ void OpenCLConfig::logsumexp_cols_offset( cl::Buffer& input_mat, int input_rows, int input_cols, cl::Buffer& output_vec, int output_offset) { auto max_buffer = amax_cols(input_mat, input_rows, input_cols); + // exp(input_mat[idx] - max_buffer[col]); auto logsumexp_coeffs = kernel(OpenCL_kernel_traits::logsumexp_coeffs); logsumexp_coeffs.setArg(0, input_mat); logsumexp_coeffs.setArg(1, static_cast(input_rows)); logsumexp_coeffs.setArg(2, max_buffer); RAISE_ENQUEUEKERNEL_ERROR(m_queue.enqueueNDRangeKernel( logsumexp_coeffs, cl::NullRange, cl::NDRange(input_rows * input_cols), cl::NullRange)); + + // sum(exp(input_mat[idx] - max_buffer[col])); sum_cols_offset(input_mat, input_rows, input_cols, output_vec, static_cast(output_offset)); + // log(sum(exp(input_mat[idx] - max_buffer[col]))) + max_buffer[col]; auto finish_lse = kernel(OpenCL_kernel_traits::finish_lse_offset); finish_lse.setArg(0, output_vec); finish_lse.setArg(1, static_cast(output_offset)); From dd0a4af300e754b4a5a6fc91de0985c295c4841f Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Wed, 11 Sep 2024 15:23:35 +0200 Subject: [PATCH 11/75] more comments --- pybnesian/kde/KDE.cpp | 6 +- pybnesian/kde/KDE.hpp | 4 +- pybnesian/kde/NormalReferenceRule.hpp | 2 +- pybnesian/kde/ProductKDE.hpp | 1 - .../learning/algorithms/hillclimbing.cpp | 38 +++++- .../learning/algorithms/hillclimbing.hpp | 120 ++++++++++++++++-- pybnesian/learning/operators/operators.cpp | 47 +++++-- pybnesian/learning/operators/operators.hpp | 32 ++++- 8 files changed, 220 insertions(+), 30 deletions(-) diff --git a/pybnesian/kde/KDE.cpp b/pybnesian/kde/KDE.cpp index 6271e3ab..bb35a9fe 100644 --- a/pybnesian/kde/KDE.cpp +++ b/pybnesian/kde/KDE.cpp @@ -39,7 +39,11 @@ DataFrame KDE::training_data() const { throw std::invalid_argument("Unreachable code."); } } - +/** + * @brief Learns the KDE parameters from the given data. + * + * @param df Data. + */ void KDE::fit(const DataFrame& df) { m_training_type = df.same_type(m_variables); diff --git a/pybnesian/kde/KDE.hpp b/pybnesian/kde/KDE.hpp index 0604e766..b13fc627 100644 --- a/pybnesian/kde/KDE.hpp +++ b/pybnesian/kde/KDE.hpp @@ -573,9 +573,9 @@ void KDE::_fit(const DataFrame& df) { using CType = typename ArrowType::c_type; auto d = m_variables.size(); - + // NOTE: The positive definiteness of the bandwidth is checked here m_bandwidth = m_bselector->bandwidth(df, m_variables); - + // Computes the Cholesky (LLT) factor of the bandwidth matrix auto llt_cov = m_bandwidth.llt(); auto cholesky = llt_cov.matrixLLT(); diff --git a/pybnesian/kde/NormalReferenceRule.hpp b/pybnesian/kde/NormalReferenceRule.hpp index 2da850b7..cf96e88e 100644 --- a/pybnesian/kde/NormalReferenceRule.hpp +++ b/pybnesian/kde/NormalReferenceRule.hpp @@ -165,7 +165,7 @@ class NormalReferenceRule : public BandwidthSelector { auto N = static_cast(df.valid_rows(variables)); auto d = static_cast(variables.size()); - + // Squared Normal Reference Rule scaling factor (the bandwidths enter the matrix squared) auto k = std::pow(4. / (N * (d + 2.)), 2.
/ (d + 4)); if constexpr (std::is_same_v) { diff --git a/pybnesian/kde/ProductKDE.hpp b/pybnesian/kde/ProductKDE.hpp index f666747e..33d2f086 100644 --- a/pybnesian/kde/ProductKDE.hpp +++ b/pybnesian/kde/ProductKDE.hpp @@ -5,7 +5,6 @@ #include #include #include - #include using opencl::OpenCLConfig, opencl::OpenCL_kernel_traits; namespace kde { diff --git a/pybnesian/learning/algorithms/hillclimbing.cpp b/pybnesian/learning/algorithms/hillclimbing.cpp index 130bbadf..883a679a 100644 --- a/pybnesian/learning/algorithms/hillclimbing.cpp +++ b/pybnesian/learning/algorithms/hillclimbing.cpp @@ -23,6 +23,35 @@ using util::ArcStringVector; namespace learning::algorithms { +/** + * @brief Executes a greedy hill-climbing algorithm for Bayesian network structure learning. This calls GreedyHillClimbing.estimate(). + * + * @param df DataFrame used to learn a Bayesian network model. + * @param bn_type BayesianNetworkType of the returned model. If start is given, bn_type is ignored. Defaults to + * pbn.SemiparametricBNType(). + * @param start Initial structure of the GreedyHillClimbing. If None, a new Bayesian network model is created. Defaults + * to None. + * @param score_str A string representing the score used to drive the search. + The possible options are: “bic” for BIC, “bge” for BGe, “cv-lik” for CVLikelihood, “holdout-lik” for + HoldoutLikelihood, “validated-lik” for ValidatedLikelihood. Defaults to "validated-lik". + * @param operators_str Set of operators in the search process. Defaults to ["arcs", "node_type"]. + * @param arc_blacklist List of arcs blacklist (forbidden arcs). Defaults to []. + * @param arc_whitelist List of arcs whitelist (forced arcs). Defaults to []. + * @param type_blacklist List of type blacklist (forbidden types). Defaults to []. + * @param type_whitelist List of type whitelist (forced types). Defaults to []. + * @param callback Callback object that is called after each iteration. Defaults to None. + * @param max_indegree Maximum indegree allowed in the graph. Defaults to 0. + * @param max_iters Maximum number of search iterations. Defaults to 2147483647. + * @param epsilon Minimum delta score allowed for each operator. If the best operator's delta score is less than epsilon, the search + process is stopped. Defaults to 0. + * @param patience The patience parameter (only used with pbn.ValidatedScore). Defaults to 0. + * @param seed Seed parameter of the score (if needed). Defaults to None. + * @param num_folds Number of folds for the CVLikelihood and ValidatedLikelihood scores. Defaults to 10. + * @param test_holdout_ratio Parameter for the HoldoutLikelihood and ValidatedLikelihood scores. Defaults to 0.2. + * @param verbose If True the progress will be displayed, otherwise nothing will be displayed. Defaults to 0. + * @return std::shared_ptr The estimated Bayesian network structure. + */ std::shared_ptr hc(const DataFrame& df, const std::shared_ptr bn_type, const std::shared_ptr start, @@ -44,7 +73,7 @@ std::shared_ptr hc(const DataFrame& df, if (!bn_type && !start) { throw std::invalid_argument("\"bn_type\" or \"start\" parameter must be specified."); } - + // If seed is not given, it is set to a random value. auto iseed = [seed]() { if (seed) return *seed; @@ -52,6 +81,7 @@ std::shared_ptr hc(const DataFrame& df, return std::random_device{}(); }(); + // If bn_type is not given, it is set to the type of the given start model.
const auto& bn_type_ = [&start, &bn_type]() -> const BayesianNetworkType& { if (start) return start->type_ref(); @@ -59,11 +89,14 @@ std::shared_ptr hc(const DataFrame& df, return *bn_type; }(); + // Checks if the given operators are valid for the given Bayesian network type ["arcs", "node_type"]. auto operators = util::check_valid_operators( bn_type_, operators_str, arc_blacklist, arc_whitelist, max_indegree, type_whitelist); + // If max_iters is 0, it is set to the maximum integer value. if (max_iters == 0) max_iters = std::numeric_limits::max(); + // If start is given, it is used as the initial model. Otherwise, a new model is created. const auto start_model = [&start, &bn_type_, &df]() -> const std::shared_ptr { if (start) return start; @@ -72,8 +105,9 @@ std::shared_ptr hc(const DataFrame& df, }(); GreedyHillClimbing hc; - auto score = util::check_valid_score(df, bn_type_, score_str, iseed, num_folds, test_holdout_ratio); + // If score is not given, it is set to the default score for the given Bayesian network type. + auto score = util::check_valid_score(df, bn_type_, score_str, iseed, num_folds, test_holdout_ratio); return hc.estimate(*operators, *score, *start_model, diff --git a/pybnesian/learning/algorithms/hillclimbing.hpp b/pybnesian/learning/algorithms/hillclimbing.hpp index a98900cf..50141eb8 100644 --- a/pybnesian/learning/algorithms/hillclimbing.hpp +++ b/pybnesian/learning/algorithms/hillclimbing.hpp @@ -42,7 +42,16 @@ std::shared_ptr hc(const DataFrame& df, int num_folds, double test_holdout_ratio, int verbose = 0); - +/** + * @brief Calculates the validation delta score for each of the variables. + * + * @tparam T Type of the Bayesian network. + * @param model Bayesian network. + * @param val_score Validated score. + * @param variables List of variables. + * @param current_local_scores Local score cache. + * @return double The validation delta score. + */ template double validation_delta_score(const T& model, const ValidatedScore& val_score, @@ -58,7 +67,28 @@ double validation_delta_score(const T& model, return nnew - prev; } - +/** + * @brief Executes a greedy hill-climbing algorithm for Bayesian network structure learning. + * + * @tparam zero_patience True if patience == 0, False otherwise. + * @tparam S Type of the score. + * @tparam T Type of the Bayesian network. + * @param op_set Set of operators in the search process. + * @param score Score that drives the search. + * @param start Initial structure. A BayesianNetworkBase or ConditionalBayesianNetworkBase. + * @param arc_blacklist List of arcs blacklist (forbidden arcs). + * @param arc_whitelist List of arcs whitelist (forced arcs). + * @param type_blacklist List of type blacklist (forbidden pbn.FactorType). + * @param type_whitelist List of type whitelist (forced pbn.FactorType). + * @param callback Callback object that is called after each iteration. + * @param max_indegree Maximum indegree allowed in the graph. + * @param max_iters Maximum number of search iterations. + * @param epsilon Minimum delta score allowed for each operator. If (best_op->delta() - epsilon) < util::machine_tol, + * then the search process is stopped. + * @param patience The patience parameter (only used with ValidatedScore). + * @param verbose If True the progress will be displayed, otherwise nothing will be displayed. + * @return std::shared_ptr The estimated Bayesian network structure of the same type as start. 
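+ *
+ * A high-level sketch of the loop implemented below: on each iteration the best operator (not in the tabu
+ * set) is selected and applied; with a ValidatedScore, the validation delta is tracked so that the best
+ * validated model is kept and the search stops after `patience` consecutive non-improving operators.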
+ */ template std::shared_ptr estimate_hc(OperatorSet& op_set, S& score, @@ -73,34 +103,41 @@ std::shared_ptr estimate_hc(OperatorSet& op_set, double epsilon, int patience, int verbose) { + // Spinner for the progress bar auto spinner = util::indeterminate_spinner(verbose); spinner->update_status("Checking dataset..."); + // Model initialization auto current_model = start.clone(); + // Model type validation current_model->force_type_whitelist(type_whitelist); - if (current_model->has_unknown_node_types()) { auto score_data = score.data(); if (score_data->num_columns() == 0) { throw std::invalid_argument( "The score does not have data to detect the node types. Set the node types for" - " all the nodes in the Bayesian network or use an score that uses data (it implements Score::data)."); + " all the nodes in the Bayesian network or use a score that uses data (it implements " + "Score::data)."); } score_data.raise_has_columns(current_model->nodes()); current_model->set_unknown_node_types(score_data, type_blacklist); } + // Model arc validation + current_model->check_blacklist( + arc_blacklist); // Checks whether the arc_blacklist is valid for the current_model + current_model->force_whitelist(arc_whitelist); // Includes the given whitelisted arcs. It checks the validity of + // the graph after including the arc whitelist. - current_model->check_blacklist(arc_blacklist); - current_model->force_whitelist(arc_whitelist); - + // OperatorSet initialization op_set.set_arc_blacklist(arc_blacklist); op_set.set_arc_whitelist(arc_whitelist); op_set.set_type_blacklist(type_blacklist); op_set.set_type_whitelist(type_whitelist); op_set.set_max_indegree(max_indegree); + // Search model initialization auto prev_current_model = current_model->clone(); auto best_model = current_model; @@ -171,10 +208,11 @@ std::shared_ptr estimate_hc(OperatorSet& op_set, if (p == 0) best_model = prev_current_model->clone(); if (++p > patience) break; accumulated_offset += validation_delta; - tabu_set.insert(best_op->opposite(*current_model)); + tabu_set.insert(best_op->opposite(*current_model)); // Add the opposite operator to the tabu set } } + // Updates the previous current model best_op->apply(*prev_current_model); if (callback) callback->call(*current_model, best_op.get(), score, iter); @@ -188,7 +226,8 @@ std::shared_ptr estimate_hc(OperatorSet& op_set, } else { static_assert(util::always_false, "Wrong Score class for hill-climbing."); } - } + + } // End of hill-climbing iterations op_set.finished(); spinner->mark_as_completed("Finished Hill-climbing!"); return best_model; } +/** + * @brief Dispatches to the appropriate estimate_hc instantiation, depending on whether the score is a + * ValidatedScore and on the patience value, and estimates the structure of the Bayesian network. + * + * @tparam T Type of the Bayesian network. + * @param op_set Set of operators in the search process. + * @param score Score that drives the search. + * @param start Initial structure. + * @param arc_blacklist List of forbidden arcs. + * @param arc_whitelist List of forced arcs. + * @param type_blacklist List of forbidden node types. + * @param type_whitelist List of forced node types. + * @param callback Callback object that is called after each iteration. + * @param max_indegree Maximum indegree allowed in the graph. + * @param max_iters Maximum number of search iterations. + * @param epsilon Minimum delta score allowed for each operator. + * @param patience The patience parameter (only used with ValidatedScore). + * @param verbose If True the progress will be displayed. + * @return std::shared_ptr The estimated Bayesian network structure. + */ template std::shared_ptr estimate_downcast_score(OperatorSet& op_set, Score& score, @@ -274,7 +332,25 @@ std::shared_ptr estimate_downcast_score(OperatorSet& op_set, } } } +/** + * @brief Checks the parameters of the hill-climbing algorithm and estimates the structure of a Bayesian network.
+ *
+ * @tparam T Type of the Bayesian network.
+ * @param op_set Set of operators in the search process.
+ * @param score Score that drives the search.
+ * @param start Initial structure. A BayesianNetworkBase or ConditionalBayesianNetworkBase.
+ * @param arc_blacklist List of arcs blacklist (forbidden arcs).
+ * @param arc_whitelist List of arcs whitelist (forced arcs).
+ * @param type_blacklist List of type blacklist (forbidden pbn.FactorType).
+ * @param type_whitelist List of type whitelist (forced pbn.FactorType).
+ * @param callback Callback object that is called after each iteration.
+ * @param max_indegree Maximum indegree allowed in the graph.
+ * @param max_iters Maximum number of search iterations.
+ * @param epsilon Minimum delta score allowed for each operator.
+ * @param patience The patience parameter (only used with ValidatedScore).
+ * @param verbose If True the progress will be displayed, otherwise nothing will be displayed.
+ * @return std::shared_ptr The estimated Bayesian network structure of the same type as start.
+ */
 template
 std::shared_ptr estimate_checks(OperatorSet& op_set,
                                    Score& score,
@@ -313,6 +389,28 @@ std::shared_ptr estimate_checks(OperatorSet& op_set,
 
 class GreedyHillClimbing {
 public:
+    /**
+     * @brief Estimates the structure of a Bayesian network. The estimated Bayesian network is of the same type as
+     * start. The set of operators allowed in the search is operators. The delta score of each operator is evaluated
+     * using the score. The initial structure of the algorithm is the model start.
+     *
+     * @tparam T Type of the Bayesian network.
+     * @param op_set Set of operators in the search process.
+     * @param score Score that drives the search.
+     * @param start Initial structure. A BayesianNetworkBase or ConditionalBayesianNetworkBase.
+     * @param arc_blacklist List of arcs blacklist (forbidden arcs).
+     * @param arc_whitelist List of arcs whitelist (forced arcs).
+     * @param type_blacklist List of type blacklist (forbidden pbn.FactorType).
+     * @param type_whitelist List of type whitelist (forced pbn.FactorType).
+     * @param callback Callback object that is called after each iteration.
+     * @param max_indegree Maximum indegree allowed in the graph.
+     * @param max_iters Maximum number of search iterations.
+     * @param epsilon Minimum delta score allowed for each operator. If the best operator's delta score is less than
+     * epsilon, the search process is stopped.
+     * @param patience The patience parameter (only used with pbn.ValidatedScore).
+     * @param verbose If True the progress will be displayed, otherwise nothing will be displayed.
+     * @return std::shared_ptr The estimated Bayesian network structure of the same type as start.
+     */
     template
     std::shared_ptr estimate(OperatorSet& op_set,
                                 Score& score,

diff --git a/pybnesian/learning/operators/operators.cpp b/pybnesian/learning/operators/operators.cpp
index 826c512e..338ca958 100644
--- a/pybnesian/learning/operators/operators.cpp
+++ b/pybnesian/learning/operators/operators.cpp
@@ -16,6 +16,12 @@ std::shared_ptr AddArc::opposite(const ConditionalBayesianNetworkBase&
     return opposite(static_cast(m));
 }
 
+/**
+ * @brief Updates the valid operations matrix and the delta matrix.
+ * The idea is that arc_whitelist and arc_blacklist are operations that have to be ignored.
+ *
+ * @param model BayesianNetwork.
+ */
 void ArcOperatorSet::update_valid_ops(const BayesianNetworkBase& model) {
     int num_nodes = model.num_nodes();
 
@@ -96,6 +102,12 @@ double cache_score_operation(const BayesianNetworkBase& model,
         return d;
     }
 }
+/**
+ * @brief Cache scores for the given BayesianNetwork and ArcOperator score.
+ *
+ * @param model BayesianNetwork.
+ * @param score Score.
+ */
 void ArcOperatorSet::cache_scores(const BayesianNetworkBase& model, const Score& score) {
     if (!score.compatible_bn(model)) {
         throw std::invalid_argument("BayesianNetwork is not compatible with the score.");
@@ -108,16 +120,18 @@ void ArcOperatorSet::cache_scores(const BayesianNetworkBase& model, const Score&
         this->m_local_cache->cache_local_scores(model, score);
     }
 
-    update_valid_ops(model);
+    update_valid_ops(model);  // Updates a matrix of valid operations and a matrix of delta scores.
 
     auto bn_type = model.type();
-    for (const auto& target_node : model.nodes()) {
+    for (const auto& target_node : model.nodes()) {  // Iterates over all target_node in the model.
std::vector new_parents_target = model.parents(target_node); int target_collapsed = model.collapsed_index(target_node); - for (const auto& source_node : model.nodes()) { + for (const auto& source_node : model.nodes()) { // Iterates over all source_node in the model. int source_collapsed = model.collapsed_index(source_node); if (valid_op(source_collapsed, target_collapsed) && - bn_type->can_have_arc(model, source_node, target_node)) { + bn_type->can_have_arc( + model, source_node, target_node)) { // If the arc operation (source_node, target_node) is valid. + // NOTE: FIXED Here the score is calculated and may fail if the covariance matrix is singular. delta(source_collapsed, target_collapsed) = cache_score_operation(model, score, @@ -209,7 +223,13 @@ void ArcOperatorSet::update_valid_ops(const ConditionalBayesianNetworkBase& mode } } } - +/** + * @brief Cache scores for the given ConditionalBayesianNetwork and ArcOperator score. + * + * @param model BayesianNetwork. + * @param score Score. + */ +// TODO: Update ConditionalBayesianNetworkBase for singular covariance? void ArcOperatorSet::cache_scores(const ConditionalBayesianNetworkBase& model, const Score& score) { if (!score.compatible_bn(model)) { throw std::invalid_argument("BayesianNetwork is not compatible with the score."); @@ -292,12 +312,18 @@ std::shared_ptr ArcOperatorSet::find_max(const ConditionalBayesianNetw else return find_max_indegree(model, tabu_set); } - +/** + * @brief Find the maximum operation for the given BayesianNetwork and ArcOperatorSet score. + * + * @param model + * @param score + * @param target_node + */ void ArcOperatorSet::update_incoming_arcs_scores(const BayesianNetworkBase& model, const Score& score, const std::string& target_node) { auto target_collapsed = model.collapsed_index(target_node); - auto parents = model.parents(target_node); + auto parents = model.parents(target_node); // The parents of the target_node auto bn_type = model.type(); for (const auto& source_node : model.nodes()) { @@ -435,7 +461,12 @@ void ArcOperatorSet::update_scores(const ConditionalBayesianNetworkBase& model, update_incoming_arcs_scores(model, score, n); } } - +/** + * @brief Cache scores for the given BayesianNetwork and ChangeNodeTypeSet score. + * + * @param model BayesianNetwork. + * @param score Score. + */ void ChangeNodeTypeSet::cache_scores(const BayesianNetworkBase& model, const Score& score) { if (model.type_ref().is_homogeneous()) { throw std::invalid_argument("ChangeNodeTypeSet can only be used with non-homogeneous Bayesian networks."); diff --git a/pybnesian/learning/operators/operators.hpp b/pybnesian/learning/operators/operators.hpp index a624c830..025d4ddc 100644 --- a/pybnesian/learning/operators/operators.hpp +++ b/pybnesian/learning/operators/operators.hpp @@ -21,7 +21,7 @@ namespace learning::operators { class Operator { public: Operator(double delta) : m_delta(delta) {} - virtual ~Operator(){}; + virtual ~Operator() {}; virtual bool is_python_derived() const { return false; } @@ -292,26 +292,43 @@ class OperatorTabuSet { SetType m_set; }; +/** + * @brief Cache of local scores for each node in the network. + * + */ class LocalScoreCache { public: LocalScoreCache() : m_local_score() {} LocalScoreCache(const BayesianNetworkBase& m) : m_local_score(m.num_nodes()) {} + /** + * @brief Cache local scores for each node in the network. 
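+     * The cached values are the baseline against which the operator sets compute their
+     * delta scores, so nodes whose parent sets did not change are not re-evaluated.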
+ * + * @param model Bayesian network + * @param score Score + */ void cache_local_scores(const BayesianNetworkBase& model, const Score& score) { + // Checks if the cache has the right size if (m_local_score.rows() != model.num_nodes()) { m_local_score = VectorXd(model.num_nodes()); } - + // Caches the local score for each node for (const auto& node : model.nodes()) { m_local_score(model.collapsed_index(node)) = score.local_score(model, node); } } - + /** + * @brief Cache Validated local scores for each node in the network. + * + * @param model Bayesian network + * @param score Validated score + */ void cache_vlocal_scores(const BayesianNetworkBase& model, const ValidatedScore& score) { + // Checks if the cache has the right size if (m_local_score.rows() != model.num_nodes()) { m_local_score = VectorXd(model.num_nodes()); } - + // Caches the validated local score for each node for (const auto& node : model.nodes()) { m_local_score(model.collapsed_index(node)) = score.vlocal_score(model, node); } @@ -833,6 +850,13 @@ class OperatorPool : public OperatorSet { std::vector> m_op_sets; }; +/** + * @brief Cache local scores for each of the operators in the pool with the given model and score. + * + * @tparam M Model type + * @param model Bayesian network + * @param score Score + */ template void OperatorPool::cache_scores(const M& model, const Score& score) { if (!this->m_local_cache) { From 297966a3904a0c5032034f974a45670dacb14694 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Wed, 11 Sep 2024 15:48:23 +0200 Subject: [PATCH 12/75] scotts and normal reference bandwidth recoded --- pybnesian/kde/NormalReferenceRule.hpp | 18 +++++++++--------- pybnesian/kde/ScottsBandwidth.hpp | 10 +++++----- pybnesian/learning/algorithms/hillclimbing.hpp | 13 +++++++------ 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/pybnesian/kde/NormalReferenceRule.hpp b/pybnesian/kde/NormalReferenceRule.hpp index cf96e88e..da640c13 100644 --- a/pybnesian/kde/NormalReferenceRule.hpp +++ b/pybnesian/kde/NormalReferenceRule.hpp @@ -130,12 +130,12 @@ class NormalReferenceRule : public BandwidthSelector { // [4*d*sqrt(det(delta))] / // / [(2*trace(delta^(-1)*delta^(-1)) + trace(delta^(-1))^2) * N] auto k = 4 * d * std::sqrt(delta.determinant()) / - (2 * (delta_inv * delta_inv).trace() + delta_inv_trace * delta_inv_trace); - + ((2 * (delta_inv * delta_inv).trace() + delta_inv_trace * delta_inv_trace) * N); + auto k2 = std::pow(k, 2. / (d + 4.)); if constexpr (std::is_same_v) { - return std::pow(k / N, 2. / (d + 4.)) * diag; + return k2 * diag; } else { - return (std::pow(k / N, 2. / (d + 4.)) * diag).template cast(); + return (k2 * diag).template cast(); } } /** @@ -151,9 +151,9 @@ class NormalReferenceRule : public BandwidthSelector { MatrixXd bandwidth(const DataFrame& df, const std::vector& variables) const { using CType = typename ArrowType::c_type; - auto cov = df.cov(variables); - - if (!util::is_psd(*cov)) { + auto cov_ptr = df.cov(variables); + auto& cov = *cov_ptr; + if (!util::is_psd(cov)) { std::stringstream ss; ss << "Covariance matrix for variables [" << variables[0]; for (size_t i = 1; i < variables.size(); ++i) { @@ -169,9 +169,9 @@ class NormalReferenceRule : public BandwidthSelector { auto k = std::pow(4. / (N * (d + 2.)), 2. 
/ (d + 4)); if constexpr (std::is_same_v) { - return k * (*cov); + return k * cov; } else { - return k * cov->template cast(); + return (k * cov).template cast(); } } }; diff --git a/pybnesian/kde/ScottsBandwidth.hpp b/pybnesian/kde/ScottsBandwidth.hpp index bcae2189..99c686d3 100644 --- a/pybnesian/kde/ScottsBandwidth.hpp +++ b/pybnesian/kde/ScottsBandwidth.hpp @@ -127,9 +127,9 @@ class ScottsBandwidth : public BandwidthSelector { MatrixXd bandwidth(const DataFrame& df, const std::vector& variables) const { using CType = typename ArrowType::c_type; - auto cov = df.cov(variables); - - if (!util::is_psd(*cov)) { + auto cov_ptr = df.cov(variables); + auto& cov = *cov_ptr; + if (!util::is_psd(cov)) { std::stringstream ss; ss << "Covariance matrix for variables [" << variables[0]; for (size_t i = 1; i < variables.size(); ++i) { @@ -146,9 +146,9 @@ class ScottsBandwidth : public BandwidthSelector { auto k = std::pow(N, -2. / (d + 4)); if constexpr (std::is_same_v) { - return k * (*cov); + return k * cov; } else { - return k * cov->template cast(); + return (k * cov).template cast(); } } }; diff --git a/pybnesian/learning/algorithms/hillclimbing.hpp b/pybnesian/learning/algorithms/hillclimbing.hpp index 50141eb8..2c164dbb 100644 --- a/pybnesian/learning/algorithms/hillclimbing.hpp +++ b/pybnesian/learning/algorithms/hillclimbing.hpp @@ -143,12 +143,12 @@ std::shared_ptr estimate_hc(OperatorSet& op_set, spinner->update_status("Caching scores..."); - LocalScoreCache local_validation = [&]() { - if constexpr (std::is_base_of_v) { - LocalScoreCache lc(*current_model); - lc.cache_vlocal_scores(*current_model, score); + LocalScoreCache local_validation = [&]() { // Local validation scores (lambda expression) + if constexpr (std::is_base_of_v) { // If the score is a ValidatedScore + LocalScoreCache lc(*current_model); // Local score cache + lc.cache_vlocal_scores(*current_model, score); // Cache the local scores return lc; - } else if constexpr (std::is_base_of_v) { + } else if constexpr (std::is_base_of_v) { // If the score is a generic Score return LocalScoreCache{}; } else { static_assert(util::always_false, "Wrong Score class for hill-climbing."); @@ -220,7 +220,8 @@ std::shared_ptr estimate_hc(OperatorSet& op_set, op_set.update_scores(*current_model, score, nodes_changed); if constexpr (std::is_base_of_v) { - spinner->update_status(best_op->ToString() + " | Validation delta: " + std::to_string(validation_delta)); + spinner->update_status(best_op->ToString() + + " | Validation delta: " + std::to_string(validation_delta)); } else if constexpr (std::is_base_of_v) { spinner->update_status(best_op->ToString()); } else { From 67b83cee66e9ce59499ee4b1b6c96d33a8c8069c Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Wed, 11 Sep 2024 16:05:21 +0200 Subject: [PATCH 13/75] operators operations reordered --- pybnesian/learning/operators/operators.cpp | 119 +++++++++++++++++---- 1 file changed, 97 insertions(+), 22 deletions(-) diff --git a/pybnesian/learning/operators/operators.cpp b/pybnesian/learning/operators/operators.cpp index 338ca958..2af47918 100644 --- a/pybnesian/learning/operators/operators.cpp +++ b/pybnesian/learning/operators/operators.cpp @@ -330,41 +330,116 @@ void ArcOperatorSet::update_incoming_arcs_scores(const BayesianNetworkBase& mode auto source_collapsed = model.collapsed_index(source_node); if (valid_op(source_collapsed, target_collapsed)) { - if (model.has_arc(source_node, target_node)) { - // Update remove arc: source_node -> target_node - util::swap_remove_v(parents, source_node); 
- double d = score.local_score(model, target_node, parents) - - this->m_local_cache->local_score(model, target_node); - parents.push_back(source_node); - delta(source_collapsed, target_collapsed) = d; - - // Update flip arc: source_node -> target_node + // ARC FLIPPING source_node -> target_node to target_node -> source_node: + if (model.has_arc(source_node, + target_node)) { // If the arc source_node -> target_node already exists, remove it and + // then put the reverse arc if possible. + // util::formatted_log_t(verbose, log_str + "model.has_arc(source_node, target_node) TBC"); + util::swap_remove_v(parents, source_node); // Remove source_node from the parents of target_node + // score of removing (source_collapsed -> target_node) + double d = score.local_score(model, target_node, parents) - // New score with the removed arc + this->m_local_cache->local_score(model, target_node); // Old score with the arc + parents.push_back(source_node); // Readd source_node to the parents of target_node + delta(source_collapsed, target_collapsed) = d; // score of removing (source_collapsed -> target_node) + + // Update flip arc: source_node -> target_node to target_node -> source_node if (valid_op(target_collapsed, source_collapsed) && - bn_type->can_have_arc(model, target_node, source_node)) { + bn_type->can_have_arc( + model, target_node, source_node)) { // If the reverse arc (target_node -> source_node) is + // possible, then put the reverse arc + + // util::formatted_log_t(verbose, + // log_str + + // "valid_op(target_collapsed, source_collapsed) " + // "bn_type->can_have_arc(model, target_node, source_node) TBC"); auto parents_source = model.parents(source_node); parents_source.push_back(target_node); - delta(target_collapsed, source_collapsed) = d + - score.local_score(model, source_node, parents_source) - - this->m_local_cache->local_score(model, source_node); + double d2; + // try { + // score of adding (target_node -> source_collapsed) + d2 = d + score.local_score(model, source_node, parents_source) - // New score with the added arc + this->m_local_cache->local_score(model, source_node); // Old score without the arc + // } catch (const util::singular_covariance_data& e) { + // util::formatted_log_t(verbose, log_str + e.what()); + // d2 = std::numeric_limits::lowest(); + + // valid_op(source_collapsed, target_collapsed) = false; + // valid_op(target_collapsed, source_collapsed) = false; + + // util::formatted_log_t(verbose, log_str + "valid_op and delta updated"); + // } + delta(target_collapsed, source_collapsed) = + d2; // score of reversing (source_collapsed -> target_node) to (target_node -> + // source_collapsed) } } else if (model.has_arc(target_node, source_node) && - bn_type->can_have_arc(model, source_node, target_node)) { - // Update flip arc: target_node -> source_node + bn_type->can_have_arc( + model, + source_node, + target_node)) { // ARC FLIPPING target_node -> source_node to source_node -> target_node: + // If the arc target_node -> source_node already exists and the reverse arc + // is possible, then put the flip the arc to source_node -> target_node. 
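+                // Local-score view of this flip (L = local score, L0 = cached local score):
+                //   delta = [ L(target | Pa(target) ∪ {source}) - L0(target) ]
+                //         + [ L(source | Pa(source) \ {target}) - L0(source) ]
+                // which is exactly the value d computed below.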
+ // util::formatted_log_t(verbose, + // log_str + + // "model.has_arc(target_node, source_node) bn_type->can_have_arc(model, " + // "source_node, target_node) TBC"); auto parents_source = model.parents(source_node); - util::swap_remove_v(parents_source, target_node); + util::swap_remove_v(parents_source, target_node); // Remove target_node from the parents of source_node parents.push_back(source_node); - double d = score.local_score(model, source_node, parents_source) + - score.local_score(model, target_node, parents) - - this->m_local_cache->local_score(model, source_node) - - this->m_local_cache->local_score(model, target_node); + + // Update flip arc score: target_node -> source_node to source_node -> target_node + double d; + // try { + d = score.local_score(model, + target_node, + parents) + // New score after adding source_node as parent of target_node + score.local_score( + model, + source_node, + parents_source) - // New score after removing target_node as parent of source_node + this->m_local_cache->local_score(model, target_node) - + this->m_local_cache->local_score(model, source_node); + + // } catch (const util::singular_covariance_data& e) { + // // In case singular covariance data is found, the operation is marked as invalid in both arc + // // directions and the delta is set to the lowest possible value + // (ArcOperatorSet::update_valid_ops). util::formatted_log_t(verbose, log_str + e.what()); d = + // std::numeric_limits::lowest(); + + // valid_op(source_collapsed, target_collapsed) = false; + // valid_op(target_collapsed, source_collapsed) = false; + // delta(source_collapsed, target_collapsed) = d; + // delta(target_collapsed, source_collapsed) = d; + + // util::formatted_log_t(verbose, log_str + "valid_op and delta updated"); + // } + parents.pop_back(); + // TODO: Is necessary parents_source.push_back(target_node);? delta(source_collapsed, target_collapsed) = d; } else if (bn_type->can_have_arc(model, source_node, target_node)) { // Update add arc: source_node -> target_node + // util::formatted_log_t(verbose, log_str + "bn_type->can_have_arc(model, source_node, target_node) + // TBC"); parents.push_back(source_node); - double d = score.local_score(model, target_node, parents) - + double d; + // try { + d = score.local_score(model, target_node, parents) - this->m_local_cache->local_score(model, target_node); + // } catch (const util::singular_covariance_data& e) { + // // In case singular covariance data is found, the operation is marked as invalid in both arc + // // directions and the delta is set to the lowest possible value + // // (ArcOperatorSet::update_valid_ops). 
util::formatted_log_t(verbose, log_str + e.what()); + // d = std::numeric_limits::lowest(); + + // valid_op(source_collapsed, target_collapsed) = false; + // valid_op(target_collapsed, source_collapsed) = false; + // delta(source_collapsed, target_collapsed) = d; + // delta(target_collapsed, source_collapsed) = d; + + // // util::formatted_log_t(verbose, log_str + "valid_op and delta updated"); + // } parents.pop_back(); delta(source_collapsed, target_collapsed) = d; } @@ -392,7 +467,7 @@ void ArcOperatorSet::update_incoming_arcs_scores(const ConditionalBayesianNetwor const Score& score, const std::string& target_node) { auto target_collapsed = model.collapsed_index(target_node); - auto parents = model.parents(target_node); + auto parents = model.parents(target_node); // The parents of the target_node auto bn_type = model.type(); for (const auto& source_node : model.joint_nodes()) { From fd653a4593db02ed89f99d709710e012049d10df Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Wed, 11 Sep 2024 16:34:37 +0200 Subject: [PATCH 14/75] pytest warning fixed --- pytest.ini | 2 +- tests/factors/continuous/KDE_test.py | 4 ++-- tests/factors/discrete/DiscreteFactor_test.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pytest.ini b/pytest.ini index 31361951..99a3c717 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,3 +1,3 @@ [pytest] testpaths = tests -norecursedirs=tests/helpers \ No newline at end of file +norecursedirs = tests/helpers \ No newline at end of file diff --git a/tests/factors/continuous/KDE_test.py b/tests/factors/continuous/KDE_test.py index dd5d9b9f..529ade49 100644 --- a/tests/factors/continuous/KDE_test.py +++ b/tests/factors/continuous/KDE_test.py @@ -2,10 +2,10 @@ import pyarrow as pa import pytest from scipy.stats import gaussian_kde -from util_test import generate_normal_data import pybnesian as pbn from pybnesian import BandwidthSelector +from util_test import generate_normal_data SIZE = 500 df = generate_normal_data(SIZE, seed=0) @@ -286,7 +286,7 @@ def _test_kde_logl_null_iter(variables, _df, _test_df): * s.scotts_factor(), ) # We initialize the logl and scipy_logl columns with NaN - _test_df.loc["logl"] = np.nan + _test_df["logl"] = np.nan _test_df["scipy_logl"] = np.nan # We calculate the logl with the KDE factor diff --git a/tests/factors/discrete/DiscreteFactor_test.py b/tests/factors/discrete/DiscreteFactor_test.py index 709fa925..4e1894bc 100644 --- a/tests/factors/discrete/DiscreteFactor_test.py +++ b/tests/factors/discrete/DiscreteFactor_test.py @@ -2,9 +2,9 @@ import pandas as pd import pyarrow as pa import pytest -from util_test import generate_normal_data import pybnesian as pbn +from util_test import generate_normal_data df = util_test.generate_discrete_data_dependent(10000) From 3daec5407a742c78f290278659086538543610a9 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Wed, 11 Sep 2024 16:43:32 +0200 Subject: [PATCH 15/75] extra comments cleaned --- pybnesian/kde/KDE.hpp | 4 +- pybnesian/learning/operators/operators.cpp | 53 +--------------------- 2 files changed, 2 insertions(+), 55 deletions(-) diff --git a/pybnesian/kde/KDE.hpp b/pybnesian/kde/KDE.hpp index b13fc627..2071e873 100644 --- a/pybnesian/kde/KDE.hpp +++ b/pybnesian/kde/KDE.hpp @@ -89,7 +89,7 @@ void UnivariateKDE::execute_logl_mat(const cl::Buffer& training_vec, cl::Buffer&, cl::Buffer& output_mat) { auto& opencl = OpenCLConfig::get(); -// TODO: This is the kernel that is executed, might be wrong? + // TODO: This is the kernel that is executed, might be wrong? 
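+    // (A rough CPU sketch of what that kernel computes, inferred from the surrounding code
+    //  rather than taken from the actual OpenCL source: for every training point t_i, test
+    //  point x_j and bandwidth h,
+    //      logl(i, j) = -0.5 * ((t_i - x_j) / h)^2 + lognorm,
+    //  and a later logsumexp reduction over i yields the KDE log-density of x_j.)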
// OpenCL kernel for calculating the log-likelihood values for each test instance // __kernel void logl_values_1d_mat_double(__global double *restrict train_vector, // __private uint train_rows, @@ -658,7 +658,6 @@ VectorXd KDE::_logl(const DataFrame& df) const { auto logl_buff = logl_buffer(df); auto& opencl = OpenCLConfig::get(); - // TODO I don't understand how the log-likelihood is calculated if (df.null_count(m_variables) == 0) { // No null variables -> Returns the data? VectorType read_data(df->num_rows()); opencl.read_from_buffer(read_data.data(), logl_buff, df->num_rows()); @@ -745,7 +744,6 @@ cl::Buffer KDE::logl_buffer(const DataFrame& df, Buffer_ptr& bitmap) const { return _logl_impl(test_buffer, m); } -// TODO Check here the interesting part? /** * @brief Function where the log-likelihood are calculated with OpenCL?. * diff --git a/pybnesian/learning/operators/operators.cpp b/pybnesian/learning/operators/operators.cpp index 2af47918..f97f5fea 100644 --- a/pybnesian/learning/operators/operators.cpp +++ b/pybnesian/learning/operators/operators.cpp @@ -334,7 +334,6 @@ void ArcOperatorSet::update_incoming_arcs_scores(const BayesianNetworkBase& mode if (model.has_arc(source_node, target_node)) { // If the arc source_node -> target_node already exists, remove it and // then put the reverse arc if possible. - // util::formatted_log_t(verbose, log_str + "model.has_arc(source_node, target_node) TBC"); util::swap_remove_v(parents, source_node); // Remove source_node from the parents of target_node // score of removing (source_collapsed -> target_node) double d = score.local_score(model, target_node, parents) - // New score with the removed arc @@ -347,27 +346,12 @@ void ArcOperatorSet::update_incoming_arcs_scores(const BayesianNetworkBase& mode bn_type->can_have_arc( model, target_node, source_node)) { // If the reverse arc (target_node -> source_node) is // possible, then put the reverse arc - - // util::formatted_log_t(verbose, - // log_str + - // "valid_op(target_collapsed, source_collapsed) " - // "bn_type->can_have_arc(model, target_node, source_node) TBC"); auto parents_source = model.parents(source_node); parents_source.push_back(target_node); double d2; - // try { // score of adding (target_node -> source_collapsed) d2 = d + score.local_score(model, source_node, parents_source) - // New score with the added arc this->m_local_cache->local_score(model, source_node); // Old score without the arc - // } catch (const util::singular_covariance_data& e) { - // util::formatted_log_t(verbose, log_str + e.what()); - // d2 = std::numeric_limits::lowest(); - - // valid_op(source_collapsed, target_collapsed) = false; - // valid_op(target_collapsed, source_collapsed) = false; - - // util::formatted_log_t(verbose, log_str + "valid_op and delta updated"); - // } delta(target_collapsed, source_collapsed) = d2; // score of reversing (source_collapsed -> target_node) to (target_node -> // source_collapsed) @@ -379,10 +363,6 @@ void ArcOperatorSet::update_incoming_arcs_scores(const BayesianNetworkBase& mode target_node)) { // ARC FLIPPING target_node -> source_node to source_node -> target_node: // If the arc target_node -> source_node already exists and the reverse arc // is possible, then put the flip the arc to source_node -> target_node. 
- // util::formatted_log_t(verbose, - // log_str + - // "model.has_arc(target_node, source_node) bn_type->can_have_arc(model, " - // "source_node, target_node) TBC"); auto parents_source = model.parents(source_node); util::swap_remove_v(parents_source, target_node); // Remove target_node from the parents of source_node @@ -390,7 +370,6 @@ void ArcOperatorSet::update_incoming_arcs_scores(const BayesianNetworkBase& mode // Update flip arc score: target_node -> source_node to source_node -> target_node double d; - // try { d = score.local_score(model, target_node, parents) + // New score after adding source_node as parent of target_node @@ -401,45 +380,15 @@ void ArcOperatorSet::update_incoming_arcs_scores(const BayesianNetworkBase& mode this->m_local_cache->local_score(model, target_node) - this->m_local_cache->local_score(model, source_node); - // } catch (const util::singular_covariance_data& e) { - // // In case singular covariance data is found, the operation is marked as invalid in both arc - // // directions and the delta is set to the lowest possible value - // (ArcOperatorSet::update_valid_ops). util::formatted_log_t(verbose, log_str + e.what()); d = - // std::numeric_limits::lowest(); - - // valid_op(source_collapsed, target_collapsed) = false; - // valid_op(target_collapsed, source_collapsed) = false; - // delta(source_collapsed, target_collapsed) = d; - // delta(target_collapsed, source_collapsed) = d; - - // util::formatted_log_t(verbose, log_str + "valid_op and delta updated"); - // } - parents.pop_back(); // TODO: Is necessary parents_source.push_back(target_node);? delta(source_collapsed, target_collapsed) = d; } else if (bn_type->can_have_arc(model, source_node, target_node)) { // Update add arc: source_node -> target_node - // util::formatted_log_t(verbose, log_str + "bn_type->can_have_arc(model, source_node, target_node) - // TBC"); parents.push_back(source_node); double d; - // try { d = score.local_score(model, target_node, parents) - - this->m_local_cache->local_score(model, target_node); - // } catch (const util::singular_covariance_data& e) { - // // In case singular covariance data is found, the operation is marked as invalid in both arc - // // directions and the delta is set to the lowest possible value - // // (ArcOperatorSet::update_valid_ops). util::formatted_log_t(verbose, log_str + e.what()); - // d = std::numeric_limits::lowest(); - - // valid_op(source_collapsed, target_collapsed) = false; - // valid_op(target_collapsed, source_collapsed) = false; - // delta(source_collapsed, target_collapsed) = d; - // delta(target_collapsed, source_collapsed) = d; - - // // util::formatted_log_t(verbose, log_str + "valid_op and delta updated"); - // } + this->m_local_cache->local_score(model, target_node); parents.pop_back(); delta(source_collapsed, target_collapsed) = d; } From e2f80f74be50e5d9e3cba3c00d94f8718eaf99d1 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Wed, 11 Sep 2024 17:02:30 +0200 Subject: [PATCH 16/75] 0.5.2 changelog updated --- CHANGELOG.md | 14 +++++++++++--- docs/source/changelog.rst | 9 +++++++++ expand_sources.py | 1 + 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a6b8836..3129489e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## v0.5.2 + +- Python code is now formatted with `black` and `isort`, and has been refactored according to `PEP 8` style guides. +- Python code partially commented with `google` docstring format. 
+- C++ code partially commented with `doxygen` docstring format.
+- Scott's and Normal Reference Rule's `bandwidth` calculations have been reordered and commented.
+- `ArcOperatorSet::update_incoming_arcs_scores` formulas have been reordered and commented.
+
 ## v0.5.1
 
 - Fixes vcpkg bad hashes ([vcpkg/#38974](https://github.com/microsoft/vcpkg/issues/38974)).
@@ -101,7 +109,7 @@ build process is simpler and orchestrated by scikit-build-core and a CMakeLists.
 - Added a `ProductKDE` class that implements `KDE` with diagonal bandwidth matrix.
 - Added an abstract class `BandwidthSelector` to implement bandwidth selection for `KDE` and `ProductKDE`. Three
   concrete implementations of bandwidth selection are included: `ScottsBandwidth`, `NormalReferenceRule` and `UCV`.
-- Added `Arguments`, `Args` and `Kwargs` to store a set of arguments to be used to create new factors through
+- Added `arguments`, `args` and `kwargs` to store a set of arguments to be used to create new factors through
   `FactorType::new_factor()`. The `Arguments` are accepted by `BayesianNetworkBase::fit()` and the constructors of
   `CVLikelihood`, `HoldoutLikelihood` and `ValidatedLikelihood`.
 
@@ -113,8 +121,8 @@ build process is simpler and orchestrated by scikit-build-core and a CMakeLists.
 ## v0.2.0
 
 - Added conditional linear Gaussian networks (`CLGNetworkType`, `CLGNetwork`, `ConditionalCLGNetwork` and
   `DynamicCLGNetwork`).
-- Implemented `ChiSquare` (and `DynamicChiSquare`) independence test.
-- Implemented `MutualInformation` (and `DynamicMutualInformation`) independence test. This is valid for hybrid data.
+- Implemented `ChiSquare` (and `DynamicChiSquare`) independence test.
+- Implemented `MutualInformation` (and `DynamicMutualInformation`) independence test. This is valid for hybrid data.
 - Implemented `BDe` (Bayesian Dirichlet equivalent) score (and `DynamicBDe`).
 - Added `UnknownFactorType` as default `FactorType` for Bayesian networks when the node type could not be deduced.
 - Added `Assignment` class to represent the assignment of values to variables.
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index 38042515..9e3e0b29 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -2,6 +2,15 @@
 Changelog
 *********
 
+v0.5.2
+======
+
+- Python code is now formatted with ``black`` and ``isort``, and has been refactored according to ``PEP 8`` style guides.
+- Python code partially commented with ``google`` docstring format.
+- C++ code partially commented with ``doxygen`` docstring format.
+- Scott's and Normal Reference Rule's ``bandwidth`` calculations have been reordered and commented.
+- ``ArcOperatorSet::update_incoming_arcs_scores`` formulas have been reordered and commented.
+ v0.5.1 ====== diff --git a/expand_sources.py b/expand_sources.py index 52c24db2..455db27f 100644 --- a/expand_sources.py +++ b/expand_sources.py @@ -1,4 +1,5 @@ import os + import conv_template From f7ee6c385838a4ae3096dace701c7fa3b6c0ef05 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Wed, 11 Sep 2024 17:39:33 +0200 Subject: [PATCH 17/75] bandwidth covariances without the diagonal --- pybnesian/kde/NormalReferenceRule.hpp | 45 +++++++++++++++++---------- pybnesian/kde/ScottsBandwidth.hpp | 9 ++++++ 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/pybnesian/kde/NormalReferenceRule.hpp b/pybnesian/kde/NormalReferenceRule.hpp index da640c13..f2fcf95c 100644 --- a/pybnesian/kde/NormalReferenceRule.hpp +++ b/pybnesian/kde/NormalReferenceRule.hpp @@ -106,16 +106,16 @@ class NormalReferenceRule : public BandwidthSelector { auto cov_ptr = df.cov(variables); auto& cov = *cov_ptr; - - if (!util::is_psd(cov)) { - std::stringstream ss; - ss << "NormalReferenceRule::diag_bandwidth -> Covariance matrix for variables [" << variables[0]; - for (size_t i = 1; i < variables.size(); ++i) { - ss << ", " << variables[i]; - } - ss << "] is not positive-definite."; - throw util::singular_covariance_data(ss.str()); - } + // NOTE: UNNECESSARY CHECK + // if (!util::is_psd(cov)) { + // std::stringstream ss; + // ss << "NormalReferenceRule::diag_bandwidth -> Covariance matrix for variables [" << variables[0]; + // for (size_t i = 1; i < variables.size(); ++i) { + // ss << ", " << variables[i]; + // } + // ss << "] is not positive-definite."; + // throw util::singular_covariance_data(ss.str()); + // } // The covariance diagonal is used to calculate the bandwidth auto diag = cov.diagonal(); auto delta = (cov.array().colwise() * diag.cwiseInverse().array()).matrix(); // diag(cov)^ (-1) * cov @@ -153,14 +153,25 @@ class NormalReferenceRule : public BandwidthSelector { auto cov_ptr = df.cov(variables); auto& cov = *cov_ptr; - if (!util::is_psd(cov)) { - std::stringstream ss; - ss << "Covariance matrix for variables [" << variables[0]; - for (size_t i = 1; i < variables.size(); ++i) { - ss << ", " << variables[i]; + + // NOTE: UNNECESSARY CHECK + // if (!util::is_psd(cov)) { + // std::stringstream ss; + // ss << "Covariance matrix for variables [" << variables[0]; + // for (size_t i = 1; i < variables.size(); ++i) { + // ss << ", " << variables[i]; + // } + // ss << "] is not positive-definite."; + // throw util::singular_covariance_data(ss.str()); + // } + // TODO: OPTIMIZE THIS + // We put the non-diagonal elements to zero + for (auto i = 0; i < cov.rows(); ++i) { + for (auto j = 0; j < cov.cols(); ++j) { + if (i != j) { + cov(i, j) = 0; + } } - ss << "] is not positive-definite."; - throw util::singular_covariance_data(ss.str()); } auto N = static_cast(df.valid_rows(variables)); diff --git a/pybnesian/kde/ScottsBandwidth.hpp b/pybnesian/kde/ScottsBandwidth.hpp index 99c686d3..8180b8b4 100644 --- a/pybnesian/kde/ScottsBandwidth.hpp +++ b/pybnesian/kde/ScottsBandwidth.hpp @@ -138,6 +138,15 @@ class ScottsBandwidth : public BandwidthSelector { ss << "] is not positive-definite."; throw util::singular_covariance_data(ss.str()); } + // TODO: OPTIMIZE THIS + // We put the non-diagonal elements to zero + for (auto i = 0; i < cov.rows(); ++i) { + for (auto j = 0; j < cov.cols(); ++j) { + if (i != j) { + cov(i, j) = 0; + } + } + } auto N = static_cast(df.valid_rows(variables)); auto d = static_cast(variables.size()); From 33b0151394ce53a0b102ea8732b8f324e5f49fec Mon Sep 17 00:00:00 2001 From: Carlos Li Hu 
Date: Wed, 11 Sep 2024 17:41:57 +0200 Subject: [PATCH 18/75] formatted_log_t verbose function --- pybnesian/util/progress.hpp | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/pybnesian/util/progress.hpp b/pybnesian/util/progress.hpp index fb481c67..fe0f8ec5 100644 --- a/pybnesian/util/progress.hpp +++ b/pybnesian/util/progress.hpp @@ -2,9 +2,44 @@ #define PYBNESIAN_UTIL_PROGRESS_HPP #include +// #include +// #include // sudo apt install libboost-all-dev +#include +// enum class log_level_t { LOG_NOTHING, LOG_CRITICAL, LOG_ERROR, LOG_WARNING, LOG_INFO, LOG_DEBUG }; namespace util { +// auto GLOBAL_LEVEL = log_level_t::LOG_INFO; +class formatted_log_t { +public: + formatted_log_t(int verbose_level, std::string msg) : verbose_level(verbose_level), msg(msg) {} + ~formatted_log_t() { + // GLOBAL_LEVEL is a global variable and could be changed at runtime + // Any customization could be here + // if (level <= GLOBAL_LEVEL) + // std::wcout << static_cast(level) << L" " << fmt.str() + // << std::endl; // Convert level to a string before printing + if (verbose_level > 0) { + std::cout << msg << std::endl; + } + } + // template + // formatted_log_t& operator%(T value) { + // fmt % value; + // return *this; + // } + // formatted_log_t log(int verbose_level, const char* msg) { return formatted_log_t(verbose_level, msg); } + +protected: + int verbose_level; + std::string msg; +}; +// Helper function. Class formatted_log_t will not be used directly. +// template +// formatted_log_t log(const char* msg) { +// return formatted_log_t(verbose_level, msg); +// } + class BaseIndeterminateSpinner { public: virtual ~BaseIndeterminateSpinner() {} From 705d15e6fa0b877eb4073ee048c4e1dfb2a5d013 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Wed, 11 Sep 2024 17:42:28 +0200 Subject: [PATCH 19/75] is_psd with Cholesky decomposition --- pybnesian/util/basic_eigen_ops.hpp | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/pybnesian/util/basic_eigen_ops.hpp b/pybnesian/util/basic_eigen_ops.hpp index 99715918..11ae49b0 100644 --- a/pybnesian/util/basic_eigen_ops.hpp +++ b/pybnesian/util/basic_eigen_ops.hpp @@ -1,7 +1,9 @@ #ifndef PYBNESIAN_UTIL_BASIC_EIGEN_OPS_HPP #define PYBNESIAN_UTIL_BASIC_EIGEN_OPS_HPP +#include #include +#include namespace util { @@ -141,17 +143,22 @@ Matrix sqrt_matrix(const M& m) { * @return false If M is not positive definite. 
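 *
 * A minimal usage sketch (the matrix values are assumed purely for illustration):
 * @code
 * Eigen::Matrix2d m;
 * m << 2.0, 1.0,
 *      1.0, 2.0;               // symmetric, eigenvalues {1, 3} -> positive definite
 * bool ok = util::is_psd(m);   // true: the Cholesky factorization (LLT) succeeds
 * @endcode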
*/ template -bool is_psd(const M& m) { +bool is_psd(const M& m, int verbose = 0) { using MatrixType = Matrix; - Eigen::SelfAdjointEigenSolver eigen_solver(m, Eigen::EigenvaluesOnly); - auto tol = eigen_solver.eigenvalues().maxCoeff() * m.rows() * std::numeric_limits::epsilon(); + Eigen::LLT lltOfM(m); // compute the Cholesky decomposition of m - if (eigen_solver.eigenvalues().minCoeff() < tol) { + if (lltOfM.info() == Eigen::NumericalIssue) { + std::stringstream ss; + ss << "basic_eigen_ops.hpp::is_psd:\t" + << "C++ Matrix m:\n" + << m << "\nCHOLESKY: Possibly non semi-positive definite matrix!"; + std::string log_str = ss.str(); + util::formatted_log_t(verbose, log_str); return false; + } else { + return true; } - - return true; } } // namespace util From db52397eebfdd839d889dd67eace4c8a805abdb4 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Wed, 11 Sep 2024 17:49:06 +0200 Subject: [PATCH 20/75] diagonal covariance matrix commented --- pybnesian/kde/NormalReferenceRule.hpp | 14 +++++++------- pybnesian/kde/ScottsBandwidth.hpp | 14 +++++++------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pybnesian/kde/NormalReferenceRule.hpp b/pybnesian/kde/NormalReferenceRule.hpp index f2fcf95c..dd890624 100644 --- a/pybnesian/kde/NormalReferenceRule.hpp +++ b/pybnesian/kde/NormalReferenceRule.hpp @@ -166,13 +166,13 @@ class NormalReferenceRule : public BandwidthSelector { // } // TODO: OPTIMIZE THIS // We put the non-diagonal elements to zero - for (auto i = 0; i < cov.rows(); ++i) { - for (auto j = 0; j < cov.cols(); ++j) { - if (i != j) { - cov(i, j) = 0; - } - } - } + // for (auto i = 0; i < cov.rows(); ++i) { + // for (auto j = 0; j < cov.cols(); ++j) { + // if (i != j) { + // cov(i, j) = 0; + // } + // } + // } auto N = static_cast(df.valid_rows(variables)); auto d = static_cast(variables.size()); diff --git a/pybnesian/kde/ScottsBandwidth.hpp b/pybnesian/kde/ScottsBandwidth.hpp index 8180b8b4..45f8a896 100644 --- a/pybnesian/kde/ScottsBandwidth.hpp +++ b/pybnesian/kde/ScottsBandwidth.hpp @@ -140,13 +140,13 @@ class ScottsBandwidth : public BandwidthSelector { } // TODO: OPTIMIZE THIS // We put the non-diagonal elements to zero - for (auto i = 0; i < cov.rows(); ++i) { - for (auto j = 0; j < cov.cols(); ++j) { - if (i != j) { - cov(i, j) = 0; - } - } - } + // for (auto i = 0; i < cov.rows(); ++i) { + // for (auto j = 0; j < cov.cols(); ++j) { + // if (i != j) { + // cov(i, j) = 0; + // } + // } + // } auto N = static_cast(df.valid_rows(variables)); auto d = static_cast(variables.size()); From 10554b1e6ce4c72115ebde192b50ec8724432aa1 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Wed, 11 Sep 2024 18:07:05 +0200 Subject: [PATCH 21/75] try catch in bandwidth --- pybnesian/kde/KDE.hpp | 8 +++++++- pybnesian/kde/ProductKDE.hpp | 6 ++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/pybnesian/kde/KDE.hpp b/pybnesian/kde/KDE.hpp index 2071e873..249847ef 100644 --- a/pybnesian/kde/KDE.hpp +++ b/pybnesian/kde/KDE.hpp @@ -574,7 +574,13 @@ void KDE::_fit(const DataFrame& df) { auto d = m_variables.size(); // NOTE: Here the positive definiteness of the bandwidth is checked - m_bandwidth = m_bselector->bandwidth(df, m_variables); + try { + m_bandwidth = m_bselector->bandwidth(df, m_variables); + } catch (util::singular_covariance_data& e) { + std::cerr << "KDE::_fit:\t" << e.what() << std::endl; + throw e; + } + // Calculates the LLT decomposition matrix of the bandwidth matrix auto llt_cov = m_bandwidth.llt(); auto cholesky = llt_cov.matrixLLT(); diff --git 
a/pybnesian/kde/ProductKDE.hpp b/pybnesian/kde/ProductKDE.hpp
index 33d2f086..d75de9c0 100644
--- a/pybnesian/kde/ProductKDE.hpp
+++ b/pybnesian/kde/ProductKDE.hpp
@@ -165,6 +165,12 @@ void ProductKDE::_fit(const DataFrame& df) {
 
     auto& opencl = OpenCLConfig::get();
 
+    // NOTE: Here the positive definiteness of the bandwidth is checked
+    // if bandwidth is not positive definite,
+    // - try to add a small value to the diagonal?
+    // m_bandwidth = m_bandwidth + VectorXd::Constant(m_variables.size(), 1e-6);
+
+    // - Add to blacklist and ignore this iteration?
     m_bandwidth = m_bselector->diag_bandwidth(df, m_variables);
 
     for (size_t i = 0; i < m_variables.size(); ++i) {

From 4dc3c8a8d81573446517cd688fec3fa4ce648b52 Mon Sep 17 00:00:00 2001
From: Carlos Li Hu
Date: Wed, 11 Sep 2024 18:15:39 +0200
Subject: [PATCH 22/75] hill climbing verbose logs

---
 pybnesian/learning/algorithms/hillclimbing.hpp | 74 ++++++++++++++++---
 1 file changed, 65 insertions(+), 9 deletions(-)

diff --git a/pybnesian/learning/algorithms/hillclimbing.hpp b/pybnesian/learning/algorithms/hillclimbing.hpp
index 2c164dbb..cd39594d 100644
--- a/pybnesian/learning/algorithms/hillclimbing.hpp
+++ b/pybnesian/learning/algorithms/hillclimbing.hpp
@@ -103,6 +103,12 @@ std::shared_ptr estimate_hc(OperatorSet& op_set,
                                double epsilon,
                                int patience,
                                int verbose) {
+    std::string log_str = "HILL-CLIMBING::estimate_hc:\t";
+    try {
+        util::formatted_log_t(verbose, log_str + "Begins");
+        // We copy the arc_blacklist
+        // auto arc_blacklist_copy = arc_blacklist;
+
     // Spinner for the progress bar
     auto spinner = util::indeterminate_spinner(verbose);
     spinner->update_status("Checking dataset...");
@@ -143,12 +149,16 @@ std::shared_ptr estimate_hc(OperatorSet& op_set,
 
     spinner->update_status("Caching scores...");
 
+    // NOTE: Here the score of each node is calculated (log-likelihood fit)
+    // Since the search starts with parentless nodes, the independent score is computed, and it turns out non-zero
+    // Options:
+    // 1. Compute the independent score and, after the log-likelihood fit fails, apply regularization?
+    // 2. Remove the variable?
+    // 3. Use a try-catch so that, on error, regularization is added to the score
+
+    // TODO: It crashes if there are isolated variables with zero variance during cross-validation -> Fix?
+    // Initializes the local validation scores for the current model
+    util::formatted_log_t(verbose, log_str + "Local Validation TBC");
     LocalScoreCache local_validation = [&]() {  // Local validation scores (lambda expression)
         if constexpr (std::is_base_of_v) {  // If the score is a ValidatedScore
             LocalScoreCache lc(*current_model);             // Local score cache
             lc.cache_vlocal_scores(*current_model, score);  // Cache the local scores
             return lc;
         } else if constexpr (std::is_base_of_v) {  // If the score is a generic Score
             return LocalScoreCache{};
         } else {
             static_assert(util::always_false, "Wrong Score class for hill-climbing.");
         }
     }();
 
-    op_set.cache_scores(*current_model, score);
+    util::formatted_log_t(verbose, log_str + "Local Validation Calculated");
 
+    // Cache scores
+    util::formatted_log_t(verbose, log_str + "op_set.cache_scores TBC");
+    // Caches the delta score values of each operator in the set.
+    op_set.cache_scores(*current_model, score);
     int p = 0;
     double accumulated_offset = 0;
+    util::formatted_log_t(verbose, log_str + "Scores cached");
     OperatorTabuSet tabu_set;
 
     if (callback) callback->call(*current_model, nullptr, score, 0);
-
+    util::formatted_log_t(verbose, log_str + "Hill climbing iterations begin");
+    // Hill climbing iterations begin
     auto iter = 0;
     while (iter < max_iters) {
         ++iter;
-
+        // Finds the best operator
+        // HC Algorithm lines 8 -> 16 [Atienza et al. (2022)]
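+        // (Patience, in brief: once the validation delta turns negative, the search may run
+        //  up to `patience` extra iterations, tabu-listing the opposite of each applied
+        //  operator, and reverts to the best validated model if no improvement appears; see
+        //  the branches on `p` and `accumulated_offset` further below.)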
+        // NOTE: Here the best operators are evaluated (log-likelihood fit)
+        util::formatted_log_t(verbose, log_str + "Best operator TBC");
         auto best_op = [&]() {
             if constexpr (zero_patience)
                 return op_set.find_max(*current_model);
             else
                 return op_set.find_max(*current_model, tabu_set);
         }();
 
         if (!best_op || (best_op->delta() - epsilon) < util::machine_tol) {
+            util::formatted_log_t(verbose, log_str + "No improvement in best_op");
             break;
         }
+        util::formatted_log_t(verbose, log_str + "Best operator Calculated" + best_op->ToString());
+        // If the best operator is nullptr or the delta is less than epsilon, then the search process fails and
+        // stops.
+        // Algorithm lines 17 -> 24 [Atienza et al. (2022)]: applies the best operator
+        // to the current model
         best_op->apply(*current_model);
-
+        // Returns the nodes changed by the best operator
         auto nodes_changed = best_op->nodes_changed(*current_model);
 
+        // Calculates the validation delta
+        util::formatted_log_t(verbose, log_str + "Validation Delta TBC");
+
         double validation_delta = [&]() {
             if constexpr (std::is_base_of_v) {
                 return validation_delta_score(*current_model, score, nodes_changed, local_validation);
             } else {
                 return best_op->delta();
             }
         }();
-
-        if ((validation_delta + accumulated_offset) > util::machine_tol) {
+        util::formatted_log_t(verbose, log_str + "Validation Delta Calculated");
+        // Updates the best model if the validation delta is greater than 0
+        if ((validation_delta + accumulated_offset) >
+            util::machine_tol) {  // If the validation delta is greater than 0, then the current model is the best
+                                  // model
+            util::formatted_log_t(verbose, log_str + "Validation Delta is greater than 0");
             if constexpr (!zero_patience) {
                 if (p > 0) {
                     best_model = current_model;
                     p = 0;
                     accumulated_offset = 0;
                 }
 
                 tabu_set.clear();
             }
-        } else {
+        } else {  // If the validation delta is less than 0, then the current model is not the best model
+            util::formatted_log_t(verbose, log_str + "Validation Delta is less than 0");
             if constexpr (zero_patience) {
                 best_model = prev_current_model;
                 break;
             } else {
                 if (p == 0) best_model = prev_current_model->clone();
                 if (++p > patience) break;
                 accumulated_offset += validation_delta;
                 tabu_set.insert(best_op->opposite(*current_model));  // Add the opposite operator to the tabu set
             }
         }
 
         // Updates the previous current model
         best_op->apply(*prev_current_model);
 
         if (callback) callback->call(*current_model, best_op.get(), score, iter);
 
+        util::formatted_log_t(verbose, log_str + "Updating scores");
+        // NOTE: Here the node scores are reevaluated (log-likelihood fit)
         op_set.update_scores(*current_model, score, nodes_changed);
+        util::formatted_log_t(verbose, log_str + "Scores updated");
         if constexpr (std::is_base_of_v) {
             spinner->update_status(best_op->ToString() +
                                    " | Validation delta: " + std::to_string(validation_delta));
         } else if constexpr (std::is_base_of_v) {
             spinner->update_status(best_op->ToString());
         } else {
             static_assert(util::always_false, "Wrong Score class for hill-climbing.");
         }
 
     }  // End of Hill climbing iterations
 
     op_set.finished();
 
     if (callback) callback->call(*best_model, nullptr, score, iter);
 
     spinner->mark_as_completed("Finished Hill-climbing!");
     return best_model;
+    } catch (util::singular_covariance_data& e) {
+        util::formatted_log_t(verbose, log_str + "catch");
+        throw e;
+        // auto arc_best_op = dynamic_cast(best_op.get());
+        // auto source_arc = arc_best_op->source();
+        // auto target_arc = arc_best_op->target();
 
+        // std::cout << e.what() << std::endl;
+        // std::cout << "Source arc:\t" << source_arc << std::endl;
+        // std::cout << "Target arc:\t" << target_arc << std::endl;
 
+        // arc_blacklist_copy.push_back(std::make_pair(source_arc, target_arc));
+        // std::cout << "New arc_blacklist:\t" << arc_blacklist << std::endl;
+        //
op_set.set_arc_blacklist(arc_blacklist_copy); + } } /** * @brief Depending on the validated_score and the patience of the hill climbing algorithm it estimates the From 0cbd688d4ad13481344b54c70a136304d9b3e973 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Wed, 11 Sep 2024 18:18:02 +0200 Subject: [PATCH 23/75] indentation fix --- .../learning/algorithms/hillclimbing.hpp | 166 +++++++++--------- 1 file changed, 83 insertions(+), 83 deletions(-) diff --git a/pybnesian/learning/algorithms/hillclimbing.hpp b/pybnesian/learning/algorithms/hillclimbing.hpp index cd39594d..75bfabea 100644 --- a/pybnesian/learning/algorithms/hillclimbing.hpp +++ b/pybnesian/learning/algorithms/hillclimbing.hpp @@ -110,44 +110,44 @@ std::shared_ptr estimate_hc(OperatorSet& op_set, // auto arc_blacklist_copy = arc_blacklist; // Spinner for the progress bar - auto spinner = util::indeterminate_spinner(verbose); - spinner->update_status("Checking dataset..."); + auto spinner = util::indeterminate_spinner(verbose); + spinner->update_status("Checking dataset..."); // Model initialization - auto current_model = start.clone(); + auto current_model = start.clone(); // Model type validation - current_model->force_type_whitelist(type_whitelist); - if (current_model->has_unknown_node_types()) { - auto score_data = score.data(); + current_model->force_type_whitelist(type_whitelist); + if (current_model->has_unknown_node_types()) { + auto score_data = score.data(); - if (score_data->num_columns() == 0) { - throw std::invalid_argument( - "The score does not have data to detect the node types. Set the node types for" + if (score_data->num_columns() == 0) { + throw std::invalid_argument( + "The score does not have data to detect the node types. Set the node types for" " all the nodes in the Bayesian network or use an score that uses data (it implements " "Score::data)."); - } + } - score_data.raise_has_columns(current_model->nodes()); - current_model->set_unknown_node_types(score_data, type_blacklist); - } -// Model arc validation + score_data.raise_has_columns(current_model->nodes()); + current_model->set_unknown_node_types(score_data, type_blacklist); + } + // Model arc validation current_model->check_blacklist( arc_blacklist); // Checks whether the arc_blacklist is valid for the current_model current_model->force_whitelist(arc_whitelist); // Include the given whitelisted arcs. It checks the validity of // the graph after including the arc whitelist. 
-    // OperatorSet initialization
-    op_set.set_arc_blacklist(arc_blacklist);
-    op_set.set_arc_whitelist(arc_whitelist);
-    op_set.set_type_blacklist(type_blacklist);
-    op_set.set_type_whitelist(type_whitelist);
-    op_set.set_max_indegree(max_indegree);
+        // OperatorSet initialization
+        op_set.set_arc_blacklist(arc_blacklist);
+        op_set.set_arc_whitelist(arc_whitelist);
+        op_set.set_type_blacklist(type_blacklist);
+        op_set.set_type_whitelist(type_whitelist);
+        op_set.set_max_indegree(max_indegree);
 
     // Search model initialization
-    auto prev_current_model = current_model->clone();
-    auto best_model = current_model;
+        auto prev_current_model = current_model->clone();
+        auto best_model = current_model;
 
-    spinner->update_status("Caching scores...");
+        spinner->update_status("Caching scores...");
 
     // NOTE: Here the score of each node is calculated (log-likelihood fit)
     // Since the search starts with parentless nodes, the independent score is computed, and it turns out non-zero
     // Options:
     // 1. Compute the independent score and, after the log-likelihood fit fails, apply regularization?
     // 2. Remove the variable?
     // 3. Use a try-catch so that, on error, regularization is added to the score
 
     // TODO: It crashes if there are isolated variables with zero variance during cross-validation -> Fix?
     // Initializes the local validation scores for the current model
     util::formatted_log_t(verbose, log_str + "Local Validation TBC");
     LocalScoreCache local_validation = [&]() {  // Local validation scores (lambda expression)
         if constexpr (std::is_base_of_v) {  // If the score is a ValidatedScore
             LocalScoreCache lc(*current_model);             // Local score cache
             lc.cache_vlocal_scores(*current_model, score);  // Cache the local scores
-            return lc;
+                return lc;
         } else if constexpr (std::is_base_of_v) {  // If the score is a generic Score
-            return LocalScoreCache{};
-        } else {
-            static_assert(util::always_false, "Wrong Score class for hill-climbing.");
-        }
-    }();
+                return LocalScoreCache{};
+            } else {
+                static_assert(util::always_false, "Wrong Score class for hill-climbing.");
+            }
+        }();
 
     util::formatted_log_t(verbose, log_str + "Local Validation Calculated");
     // Cache scores
     util::formatted_log_t(verbose, log_str + "op_set.cache_scores TBC");
     // Caches the delta score values of each operator in the set.
     op_set.cache_scores(*current_model, score);
-    int p = 0;
-    double accumulated_offset = 0;
+        int p = 0;
+        double accumulated_offset = 0;
     util::formatted_log_t(verbose, log_str + "Scores cached");
-    OperatorTabuSet tabu_set;
+        OperatorTabuSet tabu_set;
 
-    if (callback) callback->call(*current_model, nullptr, score, 0);
+        if (callback) callback->call(*current_model, nullptr, score, 0);
     util::formatted_log_t(verbose, log_str + "Hill climbing iterations begin");
     // Hill climbing iterations begin
-    auto iter = 0;
-    while (iter < max_iters) {
-        ++iter;
+        auto iter = 0;
+        while (iter < max_iters) {
+            ++iter;
         // Finds the best operator
         // HC Algorithm lines 8 -> 16 [Atienza et al. (2022)]
         // NOTE: Here the best operators are evaluated (log-likelihood fit)
         util::formatted_log_t(verbose, log_str + "Best operator TBC");
-        auto best_op = [&]() {
-            if constexpr (zero_patience)
-                return op_set.find_max(*current_model);
-            else
-                return op_set.find_max(*current_model, tabu_set);
-        }();
-
-        if (!best_op || (best_op->delta() - epsilon) < util::machine_tol) {
+            auto best_op = [&]() {
+                if constexpr (zero_patience)
+                    return op_set.find_max(*current_model);
+                else
+                    return op_set.find_max(*current_model, tabu_set);
+            }();
+
+            if (!best_op || (best_op->delta() - epsilon) < util::machine_tol) {
             util::formatted_log_t(verbose, log_str + "No improvement in best_op");
-            break;
-        }
+                break;
+            }
         util::formatted_log_t(verbose, log_str + "Best operator Calculated" + best_op->ToString());
         // If the best operator is nullptr or the delta is less than epsilon, then the search process fails and
         // stops.
         // Algorithm lines 17 -> 24 [Atienza et al.
(2022)]: applies the best operator
         // to the current model
-        best_op->apply(*current_model);
-        // Returns the nodes changed by the best operator
-        auto nodes_changed = best_op->nodes_changed(*current_model);
-
-        // Calculates the validation delta
-        util::formatted_log_t(verbose, log_str + "Validation Delta TBC");
-
-        double validation_delta = [&]() {
-            if constexpr (std::is_base_of_v) {
-                return validation_delta_score(*current_model, score, nodes_changed, local_validation);
-            } else {
-                return best_op->delta();
-            }
-        }();
+            best_op->apply(*current_model);
+            // Returns the nodes changed by the best operator
+            auto nodes_changed = best_op->nodes_changed(*current_model);
+
+            // Calculates the validation delta
+            util::formatted_log_t(verbose, log_str + "Validation Delta TBC");
+
+            double validation_delta = [&]() {
+                if constexpr (std::is_base_of_v) {
+                    return validation_delta_score(*current_model, score, nodes_changed, local_validation);
+                } else {
+                    return best_op->delta();
+                }
+            }();
         util::formatted_log_t(verbose, log_str + "Validation Delta Calculated");
         // Updates the best model if the validation delta is greater than 0
         if ((validation_delta + accumulated_offset) >
             util::machine_tol) {  // If the validation delta is greater than 0, then the current model is the best
                                   // model
             util::formatted_log_t(verbose, log_str + "Validation Delta is greater than 0");
-            if constexpr (!zero_patience) {
-                if (p > 0) {
-                    best_model = current_model;
-                    p = 0;
-                    accumulated_offset = 0;
+                if constexpr (!zero_patience) {
+                    if (p > 0) {
+                        best_model = current_model;
+                        p = 0;
+                        accumulated_offset = 0;
+                    }
+
+                    tabu_set.clear();
                 }
-
-                tabu_set.clear();
-            }
         } else {  // If the validation delta is less than 0, then the current model is not the best model
             util::formatted_log_t(verbose, log_str + "Validation Delta is less than 0");
-            if constexpr (zero_patience) {
-                best_model = prev_current_model;
-                break;
-            } else {
-                if (p == 0) best_model = prev_current_model->clone();
-                if (++p > patience) break;
-                accumulated_offset += validation_delta;
+                if constexpr (zero_patience) {
+                    best_model = prev_current_model;
+                    break;
+                } else {
+                    if (p == 0) best_model = prev_current_model->clone();
+                    if (++p > patience) break;
+                    accumulated_offset += validation_delta;
                 tabu_set.insert(best_op->opposite(*current_model));  // Add the opposite operator to the tabu set
+                }
             }
-        }
 
         // Updates the previous current model
-        best_op->apply(*prev_current_model);
+            best_op->apply(*prev_current_model);
 
-        if (callback) callback->call(*current_model, best_op.get(), score, iter);
+            if (callback) callback->call(*current_model, best_op.get(), score, iter);
         util::formatted_log_t(verbose, log_str + "Updating scores");
         // NOTE: Here the node scores are reevaluated (log-likelihood fit)
         op_set.update_scores(*current_model, score, nodes_changed);
         util::formatted_log_t(verbose, log_str + "Scores updated");
-        if constexpr (std::is_base_of_v) {
-            spinner->update_status(best_op->ToString() +
-                                   " | Validation delta: " + std::to_string(validation_delta));
-        } else if constexpr (std::is_base_of_v) {
-            spinner->update_status(best_op->ToString());
-        } else {
-            static_assert(util::always_false, "Wrong Score class for hill-climbing.");
-        }
+            if constexpr (std::is_base_of_v) {
+                spinner->update_status(best_op->ToString() +
+                                       " | Validation delta: " + std::to_string(validation_delta));
+            } else if constexpr (std::is_base_of_v) {
+                spinner->update_status(best_op->ToString());
+            } else {
+                static_assert(util::always_false, "Wrong Score class for hill-climbing.");
+            }
 
     }  // End of Hill climbing iterations
 
-    op_set.finished();
+        op_set.finished();
 
-    if (callback) callback->call(*best_model, nullptr, score, iter);
+        if (callback) callback->call(*best_model, nullptr, score, iter);
 
-
From 42d2e1887ceac75ca5a94f6bd8833d26f0d79111 Mon Sep 17 00:00:00 2001
From: Carlos Li Hu
Date: Mon, 16 Sep 2024 11:33:24 +0200
Subject: [PATCH 24/75] set unix compilers

---
 CMakeLists.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 52f06641..e84d6d6d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,6 +9,11 @@ IF(APPLE)
     SET(CMAKE_CXX_COMPILER "clang++")
 ENDIF()
 
+IF(UNIX)
+    SET(CMAKE_C_COMPILER "gcc")
+    SET(CMAKE_CXX_COMPILER "g++")
+ENDIF()
+
 find_package(Git REQUIRED)
 
 message("Git executable: ${GIT_EXECUTABLE}")
@@ -187,4 +192,3 @@ target_include_directories(__init__ SYSTEM PRIVATE "lib/eigen-3.3.7" "lib/indica
 target_link_libraries(__init__ PRIVATE Arrow::arrow_static OpenCL::OpenCL NLopt::nlopt libfort::fort Boost::dynamic_bitset Boost::math)
 
 install(TARGETS __init__ LIBRARY DESTINATION ./pybnesian)
-
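This patch pins gcc/g++ inside CMakeLists.txt itself. For comparison, the conventional alternative is to pass the compilers in through the standard CC/CXX environment variables from whatever drives the build; a minimal sketch under that assumption (not part of the patch):

# Select the same compilers from the build driver instead of CMakeLists.txt,
# relying on CMake's standard CC/CXX environment-variable convention.
import os
import subprocess

env = dict(os.environ, CC="gcc", CXX="g++")
subprocess.run(["pip", "install", "."], env=env, check=True)

Hard-coding in CMake makes the choice independent of the caller's environment, at the cost of flexibility.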
From 54684ad4d953d3092f57a4be830aa9ad39666bd0 Mon Sep 17 00:00:00 2001
From: Carlos Li Hu
Date: Mon, 16 Sep 2024 12:25:34 +0200
Subject: [PATCH 25/75] ifdef warnings corrected

---
 CMakeLists.txt                | 8 ++++++--
 pybnesian/dataset/dataset.cpp | 12 ++++++------
 pybnesian/dataset/dataset.hpp | 6 +++---
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e84d6d6d..5b1e63b9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -85,10 +85,14 @@ ELSEIF(UNIX)
     SET(SCRIPT_EXTENSION "sh")
 ENDIF()
 
-execute_process(COMMAND python expand_sources.py RESULT_VARIABLE EXPAND_SOURCES_RESULT)
+# Find the Python interpreter
+find_package(PythonInterp 3 REQUIRED)
+
+# Use the found Python interpreter in the execute_process command
+execute_process(COMMAND ${PYTHON_EXECUTABLE} expand_sources.py RESULT_VARIABLE EXPAND_SOURCES_RESULT)
 
 IF(NOT EXPAND_SOURCES_RESULT EQUAL "0")
-    message(FATAL_ERROR "$python expand_sources.py failed with ${EXPAND_SOURCES_RESULT}")
+    message(FATAL_ERROR "${PYTHON_EXECUTABLE} expand_sources.py failed with ${EXPAND_SOURCES_RESULT}")
 ENDIF()
 
 execute_process(COMMAND ${SCRIPT_PREFIX}bootstrap-vcpkg.${SCRIPT_EXTENSION} WORKING_DIRECTORY "vcpkg" RESULT_VARIABLE VCPKG_BOOTSTRAP_RESULT)

diff --git a/pybnesian/dataset/dataset.cpp b/pybnesian/dataset/dataset.cpp
index b3c3d613..aef80da9 100644
--- a/pybnesian/dataset/dataset.cpp
+++ b/pybnesian/dataset/dataset.cpp
@@ -69,13 +69,13 @@ struct ArrowSchema* extract_pycapsule_schema(py::handle pyobject) {
         throw pybind11::attribute_error("Method __arrow_c_schema__ not found.");
     }
 
-    #ifdef Python_MAJOR_VERSION == 3 && Python_MINOR_VERSION >= 9
+#if Python_MAJOR_VERSION == 3 && Python_MINOR_VERSION >= 9
     PyObject* schema_capsule_obj = PyObject_CallNoArgs(arrow_c_method);
-    #else
+#else
     PyObject* args = PyTuple_New(0);
    PyObject* schema_capsule_obj = PyObject_Call(arrow_c_method, args, NULL);
     Py_DECREF(args);
-    #endif
+#endif
 
     Py_DECREF(arrow_c_method);
 
     // extract the capsule
@@ -94,13 +94,13 @@ struct ArrowCAPIObjects extract_pycapsule_array(py::handle pyobject) {
         throw pybind11::attribute_error("Method __arrow_c_array__ not found.");
     }
 
-    #ifdef Python_MAJOR_VERSION == 3 && Python_MINOR_VERSION >= 9
+#if Python_MAJOR_VERSION == 3 && Python_MINOR_VERSION >= 9
     PyObject* array_capsule_tuple = PyObject_CallNoArgs(arrow_c_method);
-    #else
+#else
     PyObject* args = PyTuple_New(0);
     PyObject* array_capsule_tuple = PyObject_Call(arrow_c_method, args, NULL);
     Py_DECREF(args);
-    #endif
+#endif
 
     Py_DECREF(arrow_c_method);

diff --git a/pybnesian/dataset/dataset.hpp b/pybnesian/dataset/dataset.hpp
index 8c3fa813..7db48a45 100644
--- a/pybnesian/dataset/dataset.hpp
+++ b/pybnesian/dataset/dataset.hpp
@@ -2244,13 +2244,13 @@ struct type_caster> {
 
     PyObject* method_py = method.ptr();
 
-    #ifdef Python_MAJOR_VERSION == 3 && Python_MINOR_VERSION >= 9
+#if Python_MAJOR_VERSION == 3 && Python_MINOR_VERSION >= 9
     py::handle casted = PyObject_CallOneArg(method_py, schema_capsule);
-    #else
+#else
     PyObject* args = PyTuple_Pack(1, schema_capsule);
     py::handle casted = PyObject_Call(method_py, args, NULL);
     Py_DECREF(args);
-    #endif
+#endif
 
     return casted;
 }

From 045360a1c8480ade0ccffe0cdf4c195081cca011 Mon Sep 17 00:00:00 2001
From: Carlos Li Hu
Date: Tue, 1 Oct 2024 15:53:30 +0200
Subject: [PATCH 26/75] typo

---
 pybnesian/learning/algorithms/hillclimbing.cpp               | 2 +-
 .../pybindings/pybindings_learning/pybindings_algorithms.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pybnesian/learning/algorithms/hillclimbing.cpp b/pybnesian/learning/algorithms/hillclimbing.cpp
index 883a679a..e21a2703 100644
--- a/pybnesian/learning/algorithms/hillclimbing.cpp
+++ b/pybnesian/learning/algorithms/hillclimbing.cpp
@@ -34,7 +34,7 @@ namespace learning::algorithms {
  * to None.
  * @param score_str A string representing the score used to drive the search. The possible options are: “bic” for
   BIC, “bge” for BGe, “cv-lik” for CVLikelihood, “holdout-lik” for
-  HoldoutLikelihood, “validated-lik for ValidatedLikelihood. Defaults to "validated-lik".
+  HoldoutLikelihood, “validated-lik" for ValidatedLikelihood. Defaults to "validated-lik".
  * @param operators_str Set of operators in the search process. Defaults to ["arcs", "node_type"].
  * @param arc_blacklist List of arcs blacklist (forbidden arcs). Defaults to [].
  * @param arc_whitelist List of arcs whitelist (forced arcs). Defaults to [].

diff --git a/pybnesian/pybindings/pybindings_learning/pybindings_algorithms.cpp b/pybnesian/pybindings/pybindings_learning/pybindings_algorithms.cpp
index 96a37ecb..96a75d67 100644
--- a/pybnesian/pybindings/pybindings_learning/pybindings_algorithms.cpp
+++ b/pybnesian/pybindings/pybindings_learning/pybindings_algorithms.cpp
@@ -103,7 +103,7 @@ Executes a greedy hill-climbing algorithm. This calls :func:`GreedyHillClimbing.
     "bic" for :class:`BIC <pybnesian.BIC>`, "bge" for :class:`BGe <pybnesian.BGe>`, "cv-lik" for
     :class:`CVLikelihood <pybnesian.CVLikelihood>`, "holdout-lik" for
-    :class:`HoldoutLikelihood <pybnesian.HoldoutLikelihood>`, "validated-lik for
+    :class:`HoldoutLikelihood <pybnesian.HoldoutLikelihood>`, "validated-lik" for
     :class:`ValidatedLikelihood <pybnesian.ValidatedLikelihood>`.
 :param operators: Set of operators in the search process.
 :param arc_blacklist: List of arcs blacklist (forbidden arcs).
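The corrected docstring above lists the score names accepted by the hill-climbing entry point. A usage sketch, assuming the documented pybnesian.hc API (the exact signature may differ between versions):

# Learn a network with greedy hill-climbing driven by the validated
# likelihood score documented above. Illustrative only, not part of the patch.
import pybnesian as pbn
from util_test import generate_normal_data  # test helper from this repository

df = generate_normal_data(1000)
model = pbn.hc(df, score="validated-lik", operators=["arcs", "node_type"])
print(model.arcs())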
From 3387401dafc9f10579445217067c87fc47c5c955 Mon Sep 17 00:00:00 2001
From: Carlos Li Hu
Date: Wed, 16 Oct 2024 18:19:33 +0200
Subject: [PATCH 27/75] DiscreteFactor_test fix

---
 tests/factors/discrete/DiscreteFactor_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/factors/discrete/DiscreteFactor_test.py b/tests/factors/discrete/DiscreteFactor_test.py
index 4e1894bc..f274d90e 100644
--- a/tests/factors/discrete/DiscreteFactor_test.py
+++ b/tests/factors/discrete/DiscreteFactor_test.py
@@ -2,11 +2,11 @@
 import pandas as pd
 import pyarrow as pa
 import pytest
+from util_test import generate_discrete_data_dependent
 
 import pybnesian as pbn
-from util_test import generate_normal_data
 
-df = util_test.generate_discrete_data_dependent(10000)
+df = generate_discrete_data_dependent(10000)
 
 
 def test_data_type():

From 44c0e8dce4d0a5bae5ac74f6d125d6a5fc079b4e Mon Sep 17 00:00:00 2001
From: Carlos Li Hu
Date: Thu, 24 Oct 2024 13:49:19 +0000
Subject: [PATCH 28/75] code formatted

---
 tests/factors/continuous/KDE_test.py            | 2 +-
 tests/learning/scores/holdoutlikelihood_test.py | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/tests/factors/continuous/KDE_test.py b/tests/factors/continuous/KDE_test.py
index 529ade49..2d03284d 100644
--- a/tests/factors/continuous/KDE_test.py
+++ b/tests/factors/continuous/KDE_test.py
@@ -2,10 +2,10 @@
 import pyarrow as pa
 import pytest
 from scipy.stats import gaussian_kde
+from util_test import generate_normal_data
 
 import pybnesian as pbn
 from pybnesian import BandwidthSelector
-from util_test import generate_normal_data
 
 SIZE = 500
 df = generate_normal_data(SIZE, seed=0)

diff --git a/tests/learning/scores/holdoutlikelihood_test.py b/tests/learning/scores/holdoutlikelihood_test.py
index 21447064..6f064f17 100644
--- a/tests/learning/scores/holdoutlikelihood_test.py
+++ b/tests/learning/scores/holdoutlikelihood_test.py
@@ -46,9 +46,7 @@ def numpy_local_score(
             * s.scotts_factor(),
         )
         if evidence:
-            k_marg = gaussian_kde(
-                evidence_data.to_numpy().T, bw_method=k_joint.factor
-            )
+            k_marg = gaussian_kde(evidence_data.to_numpy().T, bw_method=k_joint.factor)
             loglik = np.sum(
                 k_joint.logpdf(test_node_data.to_numpy().T)
                 - k_marg.logpdf(test_evidence_data.to_numpy().T)

From 52613dfc09018c602d692ed3413ad17e7c91262c Mon Sep 17 00:00:00 2001
From: Carlos Li Hu
Date: Thu, 24 Oct 2024 13:54:35 +0000
Subject: [PATCH 29/75] pyproject.toml formatted

---
 pyproject.toml | 37 +++++++++++++++----------------------
 1 file changed, 15 insertions(+), 22 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 1be96df0..8d601d67 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,10 +13,10 @@ sdist.exclude = ["vcpkg/*", "docs/"]
 
 [project]
 name = "pybnesian"
-authors = [{name = "David Atienza", email = "datienza@fi.upm.es"}]
-description="PyBNesian is a Python package that implements Bayesian networks."
+authors = [{ name = "David Atienza", email = "datienza@fi.upm.es" }]
+description = "PyBNesian is a Python package that implements Bayesian networks."
 version = "0.5.1"
-readme = {file = "README.md", content-type = "text/markdown"}
+readme = { file = "README.md", content-type = "text/markdown" }
 license = { file = "LICENSE" }
 requires-python = ">=3.8"
 classifiers = [
@@ -24,35 +24,28 @@ classifiers = [
     "Programming Language :: C++",
     "License :: OSI Approved :: MIT License",
     "Operating System :: OS Independent",
-    "Topic :: Scientific/Engineering :: Artificial Intelligence"
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
 ]
 keywords = []
-dependencies = [
-    "pybind11>=2.6",
-    "pyarrow>=14",
-    "numpy"
-]
+dependencies = ["pybind11>=2.6", "pyarrow>=14", "numpy"]
 
 [project.urls]
-homepage = "https://github.com/davenza/PyBNesian" # FIXME not shown by pip
+homepage = "https://github.com/davenza/PyBNesian"  # FIXME not shown by pip
 documentation = "https://pybnesian.readthedocs.io/en/latest/?badge=latest"
 changelog = "https://pybnesian.readthedocs.io/en/latest/changelog.html"
 
 [tool.cibuildwheel]
-skip=["pp*",
-      "*-win32",
-      "*-musllinux*",
-      "*i686*",
-      "*ppc64le*",
-      "*s390x*"]
+skip = ["pp*", "*-win32", "*-musllinux*", "*i686*", "*ppc64le*", "*s390x*"]
 
 [tool.cibuildwheel.linux]
 before-all = "yum install -y zip unzip kernel-headers perl-IPC-Cmd flex opencl-headers ocl-icd ocl-icd-devel"
 
 [tool.cibuildwheel.macos]
-before-all = ["sudo xcodebuild -runFirstLaunch",
-    "sudo xcode-select -s /Applications/Xcode.app/Contents/Developer",
-    "brew install bison ninja",
-    "export CMAKE_GENERATOR=Xcode",
-    "export MACOSX_DEPLOYMENT_TARGET=10.14",
-    "export VCPKG_ENV_PASSTHROUGH=MACOSX_DEPLOYMENT_TARGET"]
\ No newline at end of file
+before-all = [
+    "sudo xcodebuild -runFirstLaunch",
+    "sudo xcode-select -s /Applications/Xcode.app/Contents/Developer",
+    "brew install bison ninja",
+    "export CMAKE_GENERATOR=Xcode",
+    "export MACOSX_DEPLOYMENT_TARGET=10.14",
+    "export VCPKG_ENV_PASSTHROUGH=MACOSX_DEPLOYMENT_TARGET",
+]

From c8c66a85277a4296b6364ac6e15f03d7a84c4d46 Mon Sep 17 00:00:00 2001
From: Carlos Li Hu
Date: Thu, 21 Nov 2024 11:39:40 +0000
Subject: [PATCH 30/75] warning fix

---
 conv_template.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/conv_template.py b/conv_template.py
index c911c64f..75e36029 100644
--- a/conv_template.py
+++ b/conv_template.py
@@ -281,7 +281,7 @@ def process_file(source):
     return '#line 1 "%s"\n%s' % (sourcefile, code)
 
 
-def unique_key(adict):
+def unique_key(adict: dict) -> str:
     # this obtains a unique key given a dictionary
     # currently it works by appending together n of the letters of the
     # current keys and increasing n until a unique key is found
@@ -289,6 +289,7 @@ def unique_key(adict):
     allkeys = list(adict.keys())
     done = False
     n = 1
+    newkey = ""
     while not done:
         newkey = "".join([x[:n] for x in allkeys])
         if newkey in allkeys:
@@ -299,6 +300,7 @@ def unique_key(adict):
 
 
 def main():
+    file = None
     try:
         file = sys.argv[1]
     except IndexError:
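The initialization of newkey silences a possibly-unbound warning without changing behavior, since the while loop always assigns it before use. A self-contained sketch of the function as patched, with an illustrative call (the example dictionary is hypothetical):

# unique_key builds a key that collides with no existing key by concatenating
# growing prefixes of the current keys, as in conv_template.py above.
def unique_key(adict: dict) -> str:
    allkeys = list(adict.keys())
    done = False
    n = 1
    newkey = ""  # initialized up front, as in the patch, so it is always bound
    while not done:
        newkey = "".join([x[:n] for x in allkeys])
        if newkey in allkeys:
            n += 1
        else:
            done = True
    return newkey

print(unique_key({"alpha": 1, "beta": 2}))  # "ab", which is not an existing key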
From 1d28196e567650697ed305e6a10b76bdce64346f Mon Sep 17 00:00:00 2001
From: Carlos Li Hu
Date: Thu, 21 Nov 2024 12:17:12 +0000
Subject: [PATCH 31/75] eigen warnings fixed

---
 lib/eigen-3.3.7/debug/gdb/printers.py |  6 ++---
 lib/eigen-3.3.7/scripts/relicense.py  | 34 +++++++++++++--------------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/lib/eigen-3.3.7/debug/gdb/printers.py b/lib/eigen-3.3.7/debug/gdb/printers.py
index b6ab74a7..2b5f9f1b 100644
--- a/lib/eigen-3.3.7/debug/gdb/printers.py
+++ b/lib/eigen-3.3.7/debug/gdb/printers.py
@@ -26,9 +26,9 @@
 #     register_eigen_printers (None)
 # end
 
-import gdb
 import re
-import itertools
+
+import gdb
 
 
 class EigenMatrixPrinter:
@@ -46,7 +46,7 @@ def __init__(self, variety, val):
             type = type.target()
         self.type = type.unqualified().strip_typedefs()
         tag = self.type.tag
-        regex = re.compile("\<.*\>")
+        regex = re.compile(r"\<.*\>")
         m = regex.findall(tag)[0][1:-1]
         template_params = m.split(",")
         template_params = [x.replace(" ", "") for x in template_params]

diff --git a/lib/eigen-3.3.7/scripts/relicense.py b/lib/eigen-3.3.7/scripts/relicense.py
index 8a5265f1..1179db00 100644
--- a/lib/eigen-3.3.7/scripts/relicense.py
+++ b/lib/eigen-3.3.7/scripts/relicense.py
@@ -11,7 +11,7 @@
 #
 # Make the long-awaited conversion to MPL.
 
-lgpl3_header = '''
+lgpl3_header = """
 // Eigen is free software; you can redistribute it and/or
 // modify it under the terms of the GNU Lesser General Public
 // License as published by the Free Software Foundation; either
@@ -30,7 +30,7 @@
 // You should have received a copy of the GNU Lesser General Public
 // License and a copy of the GNU General Public License along with
 // Eigen. If not, see <http://www.gnu.org/licenses/>.
-'''
+"""
 
 mpl2_header = """
@@ -41,29 +41,29 @@
 import os
 import sys
 
-exclusions = set(['relicense.py'])
+exclusions = set(["relicense.py"])
+
 
 def update(text):
-  if text.find(lgpl3_header) == -1:
-    return text, False
-  return text.replace(lgpl3_header, mpl2_header), True
+    if text.find(lgpl3_header) == -1:
+        return text, False
+    return text.replace(lgpl3_header, mpl2_header), True
+
 
 rootdir = sys.argv[1]
 for root, sub_folders, files in os.walk(rootdir):
     for basename in files:
-        if basename in exclusions:
-            print 'SKIPPED', filename
-            continue
         filename = os.path.join(root, basename)
+        if basename in exclusions:
+            print("SKIPPED", filename)
+            continue
-        fo = file(filename)
-        text = fo.read()
-        fo.close()
+        with open(filename, "r") as fo:
+            text = fo.read()
         text, updated = update(text)
         if updated:
-            fo = file(filename, "w")
-            fo.write(text)
-            fo.close()
-            print 'UPDATED', filename
+            with open(filename, "w") as fo:
+                fo.write(text)
+            print("UPDATED", filename)
         else:
-            print '  ', filename
+            print(" ", filename)
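The printers.py change is the usual fix for the invalid-escape warning: "\<" is not a recognized string escape, so recent Python versions emit a SyntaxWarning when the module is compiled, while a raw string keeps the regex byte-for-byte identical. A minimal illustration (not part of the patch):

import re

# A raw string avoids the invalid "\<" string escape while matching the same
# pattern: "\<" and "\>" in a regex simply match the literal angle brackets.
regex = re.compile(r"\<.*\>")
print(regex.findall("Matrix<double,3,3>"))  # ['<double,3,3>']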
From c84a04dd48a9522713a84238e18fb56f024a997f Mon Sep 17 00:00:00 2001
From: Carlos Li Hu
Date: Mon, 25 Nov 2024 11:33:05 +0000
Subject: [PATCH 32/75] util_test upgraded and renamed

---
 tests/helpers/util_test.py            | 341 +++++++++++++++++++++-----
 tests/learning/parameters/mle_test.py |   3 +-
 2 files changed, 277 insertions(+), 67 deletions(-)

diff --git a/tests/helpers/util_test.py b/tests/helpers/util_test.py
index f368bee5..d8ba0d12 100644
--- a/tests/helpers/util_test.py
+++ b/tests/helpers/util_test.py
@@ -1,8 +1,27 @@
 import numpy as np
 import pandas as pd
 
-
-def generate_normal_data(size, seed=0):
+TRUE_LABEL = "class_label"
+DATA_SIZE = 10000
+SEED = 0
+
+
+# TODO: Copy to pybnesian
+def generate_normal_data(size: int, seed: int = SEED) -> pd.DataFrame:
+    """Generates a DataFrame of normally distributed data with linear Gaussian relationships.
+    The relationships are as follows:
+    - A ~ N(3, 0.5)
+    - B ~ N(2.5 + 1.65 * A, 2)
+    - C ~ N(-4.2 - 1.2 * A + 3.2 * B, 0.75)
+    - D ~ N(1.5 - 0.9 * A + 5.6 * B + 0.3 * C, 0.5)
+
+    Args:
+        size (int): The sample size.
+        seed (int, optional): The seed for random sampling. Defaults to 0.
+
+    Returns:
+        pd.DataFrame: The DataFrame.
+    """
     np.random.seed(seed)
 
     a_array = np.random.normal(3, 0.5, size=size)
@@ -17,57 +36,120 @@ def generate_normal_data(size, seed=0):
         + 0.3 * c_array
         + np.random.normal(0, 0.5, size=size)
     )
+    df = pd.DataFrame({"A": a_array, "B": b_array, "C": c_array, "D": d_array})
+
+    return df
 
-    return pd.DataFrame({"a": a_array, "b": b_array, "c": c_array, "d": d_array})
 
+def generate_normal_data_independent(size: int, seed: int = SEED) -> pd.DataFrame:
+    """Generates a DataFrame of normally distributed data with linear Gaussian relationships and independent variables.
+    The relationships are as follows:
+    - A ~ N(3, 0.5)
+    - B ~ N(2.5, 2)
+    - C ~ N(-4.2, 0.75)
+    - D ~ N(1.5, 0.5)
 
-def generate_normal_data_indep(size, seed=0):
+    Args:
+        size (int): The sample size.
+        seed (int, optional): The seed for random sampling. Defaults to 0.
+
+    Returns:
+        pd.DataFrame: The DataFrame.
+    """
     np.random.seed(seed)
 
     a_array = np.random.normal(3, 0.5, size=size)
     b_array = np.random.normal(2.5, 2, size=size)
-    c_array = (
-        -4.2 - 1.2 * a_array + 3.2 * b_array + np.random.normal(0, 0.75, size=size)
-    )
-    d_array = 1.5 - 0.3 * c_array + np.random.normal(0, 0.5, size=size)
+    c_array = np.random.normal(-4.2, 0.75, size=size)
+    d_array = np.random.normal(1.5, 0.5, size=size)
+
+    df = pd.DataFrame({"A": a_array, "B": b_array, "C": c_array, "D": d_array})
+    return df
 
-    return pd.DataFrame({"a": a_array, "b": b_array, "c": c_array, "d": d_array})
 
+def generate_non_normal_data(size: int, seed: int = SEED) -> pd.DataFrame:
+    """Generates a DataFrame of uniformly distributed data with non-linear relationships.
+    The relationships are as follows:
+    - A ~ U(0, 10)
+    - B ~ U(5, 15)
+    - C ~ sin(A) + cos(B) + U(-1, 1)
+    - D ~ exp(A / 10) + log(B + 1) + U(-0.5, 0.5)
 
+    Args:
+        size (int): The sample size.
+        seed (int, optional): The seed for random sampling. Defaults to 0.
 
-def generate_discrete_data_uniform(size, seed=0):
+    Returns:
+        pd.DataFrame: The DataFrame.
+    """
     np.random.seed(seed)
 
-    a_dict = np.asarray(["a1", "a2"])
-    b_dict = np.asarray(["b1", "b2", "b3"])
-    c_dict = np.asarray(["c1", "c2"])
-    d_dict = np.asarray(["d1", "d2", "d3", "d4"])
+    # Generate uniformly distributed data
+    a_values = np.random.uniform(0, 10, size)
+    b_values = np.random.uniform(5, 15, size)
 
-    return pd.DataFrame(
+    # Generate non-linear relationships
+    c_values = np.sin(a_values) + np.cos(b_values) + np.random.uniform(-1, 1, size)
+    d_values = (
+        np.exp(a_values / 10)
+        + np.log(b_values + 1)
+        + np.random.uniform(-0.5, 0.5, size)
+    )
+
+    # DataFrame
+    df = pd.DataFrame(
         {
-            "A": a_dict[np.random.randint(0, a_dict.size, size=size)],
-            "B": b_dict[np.random.randint(0, b_dict.size, size=size)],
-            "C": c_dict[np.random.randint(0, c_dict.size, size=size)],
-            "D": d_dict[np.random.randint(0, d_dict.size, size=size)],
-        },
-        dtype="category",
+            "A": a_values,
+            "B": b_values,
+            "C": c_values,
+            "D": d_values,
+        }
     )
+    return df
 
+
+def generate_discrete_data(size: int, seed: int = SEED) -> pd.DataFrame:
+    """Generates a DataFrame of discrete data with dependent variables.
+    The relationships are as follows:
+    - A ~ Categorical(0.75, 0.25)
+    - B ~ Categorical(0.33, 0.33, 0.34) if A = a1, else Categorical(0, 0.8, 0.2)
+    - C ~ Categorical(0.5, 0.5) if A = a1 and B = b1, else Categorical(0.75, 0.25) if A = a1 and B = b2, else Categorical(0.2, 0.8) if A = a1 and B = b3, else Categorical(1, 0) if A = a2 and B = b1, else Categorical(0, 1) if A = a2 and B = b2, else Categorical(0.01, 0.99) if A = a2 and B = b3
+    - D ~ Categorical(0.25, 0.25, 0.25, 0.25) if C = c1, else Categorical(0.7, 0, 0.15, 0.15) if C = c2
 
-def generate_discrete_data_dependent(size, seed=0):
+    Args:
+        size (int): The sample size.
+        seed (int, optional): The seed for random sampling. Defaults to 0.
+
+    Returns:
+        pd.DataFrame: The DataFrame.
+    """
+    # Initialization
     np.random.seed(seed)
 
-    a_dict = np.asarray(["a1", "a2"])
-    b_dict = np.asarray(["b1", "b2", "b3"])
-    c_dict = np.asarray(["c1", "c2"])
-    d_dict = np.asarray(["d1", "d2", "d3", "d4"])
+    a_dict = np.asarray(["A1", "A2"])
+    b_dict = np.asarray(["B1", "B2", "B3"])
+    c_dict = np.asarray(["C1", "C2"])
+    d_dict = np.asarray(["D1", "D2", "D3", "D4"])
 
     a_values = a_dict[np.random.choice(a_dict.size, size, p=[0.75, 0.25])]
     b_values = np.empty_like(a_values)
     c_values = np.empty_like(a_values)
     d_values = np.empty_like(a_values)
 
-    a1_indices = a_values == "a1"
+    # Indices
+    a1_indices = a_values == "A1"
+
+    a1b1_indices = np.logical_and(a_values == "A1", b_values == "B1")
+    a1b2_indices = np.logical_and(a_values == "A1", b_values == "B2")
+    a1b3_indices = np.logical_and(a_values == "A1", b_values == "B3")
+    a2b1_indices = np.logical_and(a_values == "A2", b_values == "B1")
+    a2b2_indices = np.logical_and(a_values == "A2", b_values == "B2")
+    a2b3_indices = np.logical_and(a_values == "A2", b_values == "B3")
+    c1_indices = c_values == "C1"
+    c2_indices = c_values == "C2"
+
+    # Sampling
     b_values[a1_indices] = b_dict[
         np.random.choice(b_dict.size, np.sum(a1_indices), p=[0.33, 0.33, 0.34])
     ]
@@ -75,13 +157,6 @@ def generate_discrete_data_dependent(size, seed=0):
         np.random.choice(b_dict.size, np.sum(~a1_indices), p=[0, 0.8, 0.2])
     ]
 
-    a1b1_indices = np.logical_and(a_values == "a1", b_values == "b1")
-    a1b2_indices = np.logical_and(a_values == "a1", b_values == "b2")
-    a1b3_indices = np.logical_and(a_values == "a1", b_values == "b3")
-    a2b1_indices = np.logical_and(a_values == "a2", b_values == "b1")
-    a2b2_indices = np.logical_and(a_values == "a2", b_values == "b2")
-    a2b3_indices = np.logical_and(a_values == "a2", b_values == "b3")
-
     c_values[a1b1_indices] = c_dict[
         np.random.choice(c_dict.size, np.sum(a1b1_indices), p=[0.5, 0.5])
     ]
@@ -101,9 +176,6 @@ def generate_discrete_data_dependent(size, seed=0):
         np.random.choice(c_dict.size, np.sum(a2b3_indices), p=[0.01, 0.99])
     ]
 
-    c1_indices = c_values == "c1"
-    c2_indices = c_values == "c2"
-
     d_values[c1_indices] = d_dict[
         np.random.choice(d_dict.size, np.sum(c1_indices), p=[0.25, 0.25, 0.25, 0.25])
     ]
@@ -111,38 +183,85 @@ def generate_discrete_data_dependent(size, seed=0):
         np.random.choice(d_dict.size, np.sum(c2_indices), p=[0.7, 0, 0.15, 0.15])
     ]
 
-    return pd.DataFrame(
+    # DataFrame
+    df = pd.DataFrame(
         {"A": a_values, "B": b_values, "C": c_values, "D": d_values}, dtype="category"
     )
+    return df
+
+
+def generate_discrete_data_independent(size: int, seed: int = SEED) -> pd.DataFrame:
+    """Generates a DataFrame of discrete data with uniform distributions.
+    The relationships are as follows:
+    - A ~ Categorical(a1, a2)
+    - B ~ Categorical(b1, b2, b3)
+    - C ~ Categorical(c1, c2)
+    - D ~ Categorical(d1, d2, d3, d4)
+
+    Args:
+        size (int): The sample size.
+        seed (int, optional): The seed for random sampling. Defaults to 0.
+
+    Returns:
+        pd.DataFrame: The DataFrame.
+    """
+    # Initialization
+    np.random.seed(seed)
+
+    a_dict = np.asarray(["A1", "A2"])
+    b_dict = np.asarray(["B1", "B2", "B3"])
+    c_dict = np.asarray(["C1", "C2"])
+    d_dict = np.asarray(["D1", "D2", "D3", "D4"])
+
+    # DataFrame
+    df = pd.DataFrame(
+        {
+            "A": a_dict[np.random.randint(0, a_dict.size, size=size)],
+            "B": b_dict[np.random.randint(0, b_dict.size, size=size)],
+            "C": c_dict[np.random.randint(0, c_dict.size, size=size)],
+            "D": d_dict[np.random.randint(0, d_dict.size, size=size)],
+        },
+        dtype="category",
+    )
+    return df
+
 
+def generate_hybrid_data(size: int, seed: int = SEED) -> pd.DataFrame:
+    """Generates a DataFrame of hybrid data with discrete and continuous variables.
+    The relationships are as follows:
+    - A ~ Categorical(0.75, 0.25)
+    - B ~ Categorical(0.3, 0.4, 0.3) if A = a1, else Categorical(0.2, 0.5, 0.3)
+    - C ~ N(-4.2, 0.75)
+    - D ~ N(1, 0.75) if A = a1 and B = b1, else N(-2 + C, 2) if A = a1 and B = b2, else N(-1 + 3 * C, 0.25) if A = a1 and B = b3, else N(2, 1) if A = a2 and B = b1, else N(3.5 - 1.2 * C, 1) if A = a2 and B = b2, else N(4.8 - 2 * C, 1.5) if A = a2 and B = b3
 
-def generate_hybrid_data(size, seed=0):
-    #
-    # Generate data from:
-    #
-    #  A   B   C
-    #   \  |  /
-    #    \ | /
-    #      v
-    #      D
+    Args:
+        size (int): The sample size.
+        seed (int, optional): The seed for random sampling. Defaults to 0.
+
+    Returns:
+        pd.DataFrame: The DataFrame.
+    """
+    # Initialization
     np.random.seed(seed)
 
-    a_dict = np.asarray(["a1", "a2"])
+    a_dict = np.asarray(["A1", "A2"])
     a_values = a_dict[np.random.choice(a_dict.size, size, p=[0.75, 0.25])]
 
-    b_dict = np.asarray(["b1", "b2", "b3"])
+    b_dict = np.asarray(["B1", "B2", "B3"])
     b_values = b_dict[np.random.choice(b_dict.size, size, p=[0.3, 0.4, 0.3])]
 
     c_values = -4.2 + np.random.normal(0, 0.75, size=size)
+    d_values = np.empty_like(c_values)
 
-    a1b1_indices = np.logical_and(a_values == "a1", b_values == "b1")
-    a1b2_indices = np.logical_and(a_values == "a1", b_values == "b2")
-    a1b3_indices = np.logical_and(a_values == "a1", b_values == "b3")
-    a2b1_indices = np.logical_and(a_values == "a2", b_values == "b1")
-    a2b2_indices = np.logical_and(a_values == "a2", b_values == "b2")
-    a2b3_indices = np.logical_and(a_values == "a2", b_values == "b3")
+    # Indices
+    a1b1_indices = np.logical_and(a_values == "A1", b_values == "B1")
+    a1b2_indices = np.logical_and(a_values == "A1", b_values == "B2")
+    a1b3_indices = np.logical_and(a_values == "A1", b_values == "B3")
+    a2b1_indices = np.logical_and(a_values == "A2", b_values == "B1")
+    a2b2_indices = np.logical_and(a_values == "A2", b_values == "B2")
+    a2b3_indices = np.logical_and(a_values == "A2", b_values == "B3")
 
-    d_values = np.empty_like(c_values)
+    # Sampling
     d_values[a1b1_indices] = np.random.normal(1, 0.75, size=a1b1_indices.sum())
     d_values[a1b2_indices] = (
         -2 + c_values[a1b2_indices] + np.random.normal(0, 2, size=a1b2_indices.sum())
@@ -164,7 +283,8 @@ def generate_hybrid_data(size, seed=0):
         + np.random.normal(0, 1.5, size=a2b3_indices.sum())
     )
 
-    return pd.DataFrame(
+    # DataFrame
+    df = pd.DataFrame(
         {
             "A": pd.Series(a_values, dtype="category"),
             "B": pd.Series(b_values, dtype="category"),
@@ -172,23 +292,46 @@ def generate_hybrid_data(size, seed=0):
             "D": d_values,
         }
     )
-
-
-def generate_indep_hybrid_data(size, seed=0):
+    return df
+
+
+def generate_hybrid_data_independent(size: int, seed: int = SEED) -> pd.DataFrame:
+    """Generates a DataFrame of hybrid data with independent discrete and continuous variables.
+    The relationships are as follows:
+    - D2 ~ Categorical(0.5, 0.5)
+    - D3 ~ Categorical(0.33, 0.34, 0.33)
+    - D4 ~ Categorical(0.25, 0.25, 0.25, 0.25)
+    - D5 ~ Categorical(0.2, 0.2, 0.2, 0.2, 0.2)
+    - D6 ~ Categorical(0.166, 0.166, 0.166, 0.166, 0.166, 0.17)
+    - C1 ~ N(-4.2, 0.75)
+    - C2 ~ N(1, 2)
+    - C3 ~ N(2, 0.7)
+    - C4 ~ N(-3, 2.5)
+    - C5 ~ N(-1.2, 0.5)
+    - C6 ~ N(3, 1.5)
+
+    Args:
+        size (int): The sample size.
+        seed (int, optional): The seed for random sampling. Defaults to 0.
+
+    Returns:
+        pd.DataFrame: The DataFrame.
+    """
     np.random.seed(seed)
 
-    d2_dict = np.asarray(["a1", "a2"])
+    # Sampling
+    d2_dict = np.asarray(["A1", "A2"])
     d2_values = d2_dict[np.random.choice(d2_dict.size, size, p=[0.5, 0.5])]
 
-    d3_dict = np.asarray(["b1", "b2", "b3"])
+    d3_dict = np.asarray(["B1", "B2", "B3"])
     d3_values = d3_dict[np.random.choice(d3_dict.size, size, p=[0.33, 0.34, 0.33])]
 
-    d4_dict = np.asarray(["c1", "c2", "c3", "c4"])
+    d4_dict = np.asarray(["C1", "C2", "C3", "C4"])
     d4_values = d4_dict[
         np.random.choice(d4_dict.size, size, p=[0.25, 0.25, 0.25, 0.25])
     ]
 
-    d5_dict = np.asarray(["d1", "d2", "d3", "d4", "d5"])
+    d5_dict = np.asarray(["D1", "D2", "D3", "D4", "D5"])
     d5_values = d5_dict[
         np.random.choice(d5_dict.size, size, p=[0.2, 0.2, 0.2, 0.2, 0.2])
     ]
@@ -207,7 +350,8 @@ def generate_indep_hybrid_data(size, seed=0):
     c5_values = np.random.normal(-1.2, 0.5, size=size)
     c6_values = np.random.normal(3, 1.5, size=size)
 
-    return pd.DataFrame(
+    # DataFrame
+    df = pd.DataFrame(
         {
             "D2": pd.Series(d2_values, dtype="category"),
             "D3": pd.Series(d3_values, dtype="category"),
@@ -222,3 +366,70 @@ def generate_indep_hybrid_data(size, seed=0):
             "C6": c6_values,
         }
     )
+    return df
+
+
+def generate_normal_data_classification(size: int, seed: int = SEED) -> pd.DataFrame:
+    """Generates a DataFrame of normally distributed data with linear Gaussian relationships and a true label.
+    The relationships are as follows:
+    - TRUE_LABEL ~ Categorical(0.3, 0.4, 0.3)
+    - A ~ N(-4.2, 0.75)
+    - B ~ N(0, 0.25) if class = class1, else N(1, 0.5) if class = class2, else N(2, 1) if class = class3
+    - C ~ N(-2 + 2 * B, 1) if class = class1, else N(1 + 0.5 * B, 0.5) if class = class2, else N(3 + 3 * B, 0.25) if class = class3
+
+    Args:
+        size (int): The sample size.
+        seed (int, optional): The seed for random sampling. Defaults to 0.
+
+    Returns:
+        pd.DataFrame: The DataFrame.
+    """
+    # Initialization
+    np.random.seed(seed)
+
+    class_dict = np.asarray(["Class1", "Class2", "Class3"])
+    class_values = class_dict[
+        np.random.choice(class_dict.size, size, p=[0.3, 0.4, 0.3])
+    ]
+
+    a_values = -4.2 + np.random.normal(0, 0.75, size=size)
+
+    b_values = np.empty_like(a_values)
+    c_values = np.empty_like(a_values)
+
+    # Indices
+    class1_indices = class_values == "Class1"
+    class2_indices = class_values == "Class2"
+    class3_indices = class_values == "Class3"
+
+    # Sampling
+    # b_values based on class_values
+    b_values[class1_indices] = np.random.normal(0, 0.25, size=class1_indices.sum())
+    b_values[class2_indices] = np.random.normal(1, 0.5, size=class2_indices.sum())
+    b_values[class3_indices] = np.random.normal(2, 1, size=class3_indices.sum())
+
+    # c_values based on class_values and b_values
+    c_values[class1_indices] = (
+        -2
+        + 2 * b_values[class1_indices]
+        + np.random.normal(0, 1, size=class1_indices.sum())
+    )
+    c_values[class2_indices] = (
+        1
+        + 0.5 * b_values[class2_indices]
+        + np.random.normal(0, 0.5, size=class2_indices.sum())
+    )
+    c_values[class3_indices] = (
+        3
+        + 3 * b_values[class3_indices]
+        + np.random.normal(0, 0.25, size=class3_indices.sum())
+    )
+
+    # DataFrame
+    df = pd.DataFrame(
+        {
+            TRUE_LABEL: pd.Series(class_values, dtype="category"),
+            "A": a_values,
+            "B": b_values,
+            "C": c_values,
+        }
+    )
+    return df

diff --git a/tests/learning/parameters/mle_test.py b/tests/learning/parameters/mle_test.py
index aa8a9031..0ca091ef 100644
--- a/tests/learning/parameters/mle_test.py
+++ b/tests/learning/parameters/mle_test.py
@@ -1,9 +1,8 @@
 import numpy as np
+import pybnesian as pbn
 import pytest
 from util_test import generate_normal_data
 
-import pybnesian as pbn
-
 SIZE = 10000
 df = generate_normal_data(SIZE)
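The upgraded helpers now document their generating distributions in the docstrings. As a quick, illustrative sanity check of one of them (not part of the patch), the documented linear relationship B ~ N(2.5 + 1.65 * A, 2) can be recovered from a large sample by least squares:

# Recover the linear coefficients of B given A from generate_normal_data;
# the slope should be close to 1.65 and the intercept close to 2.5.
import numpy as np
from util_test import generate_normal_data  # helper shown above

df = generate_normal_data(10000)
slope, intercept = np.polyfit(df["A"], df["B"], deg=1)
print(round(slope, 2), round(intercept, 2))  # approximately 1.65 and 2.5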
From febdea3d41d7604ac30e73d0405ee7fcab8e4717 Mon Sep 17 00:00:00 2001
From: Carlos Li Hu
Date: Mon, 25 Nov 2024 11:42:56 +0000
Subject: [PATCH 33/75] Tests adapted: Variable naming to uppercase

---
 tests/dataset/crossvalidation_test.py         |  71 ++-
 tests/dataset/holdout_test.py                 |  27 +-
 tests/factors/continuous/CKDE_test.py         | 235 ++++----
 tests/factors/continuous/KDE_test.py          | 111 ++--
 .../continuous/LinearGaussianCPD_test.py      | 151 +++--
 tests/factors/continuous/ProductKDE_test.py   | 111 ++--
 tests/factors/discrete/DiscreteFactor_test.py |  13 +-
 tests/factors/factor_type_test.py             |  43 +-
 .../learning/algorithms/hillclimbing_test.py  |  31 +-
 tests/learning/operators/operatorpool_test.py |   5 +-
 tests/learning/operators/operators_test.py    |  95 ++-
 tests/learning/operators/operatorset_test.py  |  25 +-
 .../operators/operatorstabuset_test.py        |  18 +-
 tests/learning/parameters/mle_test.py         |  16 +-
 tests/learning/scores/bic_test.py             |  75 ++-
 tests/learning/scores/cvlikelihood_test.py    | 203 ++++---
 .../learning/scores/holdoutlikelihood_test.py | 193 ++++---
 tests/models/BayesianNetwork_test.py          | 323 ++++++-----
 tests/models/BayesianNetwork_type_test.py     |  79 ++-
 tests/models/DynamicBayesianNetwork_test.py   | 111 ++--
 tests/models/HeterogeneousBN_test.py          |  17 +-
 tests/models/SemiparametricBN_test.py         | 159 +++---
 tests/serialization/serialize_factor_test.py  |  97 ++--
 tests/serialization/serialize_models_test.py  | 539 +++++++++---------
 24 files changed, 1362 insertions(+), 1386 deletions(-)

diff --git a/tests/dataset/crossvalidation_test.py b/tests/dataset/crossvalidation_test.py
index 6fb10867..afff230f 100644
--- a/tests/dataset/crossvalidation_test.py
+++ b/tests/dataset/crossvalidation_test.py
@@ -1,7 +1,6 @@
 import numpy as np
-from util_test import generate_normal_data
-
 import pybnesian as pbn
+from util_test import generate_normal_data
 
 SIZE = 10000
@@ -98,73 +97,73 @@ def test_cv_num_folds():
 
 def test_cv_loc():
     cv = pbn.CrossValidation(df)
-    for train_df, test_df in cv.loc("a"):
+    for train_df, test_df in cv.loc("A"):
         assert (
             train_df.num_columns == 1
-        ), 'Only column "a" must be present in train DataFrame.'
+        ), 'Only column "A" must be present in train DataFrame.'
         assert (
             test_df.num_columns == 1
-        ), 'Only column "a" must be present in test DataFrame.'
+        ), 'Only column "A" must be present in test DataFrame.'
 
         train_schema = train_df.schema
         test_schema = test_df.schema
         assert train_schema.names == [
-            "a"
-        ], 'Only column "a" must be present in train DataFrame.'
+            "A"
+        ], 'Only column "A" must be present in train DataFrame.'
         assert test_schema.names == [
-            "a"
-        ], 'Only column "a" must be present in test DataFrame.'
+            "A"
+        ], 'Only column "A" must be present in test DataFrame.'
 
     for train_df, test_df in cv.loc(1):
         assert (
             train_df.num_columns == 1
-        ), 'Only column "b" must be present in train DataFrame.'
+        ), 'Only column "B" must be present in train DataFrame.'
         assert (
             test_df.num_columns == 1
-        ), 'Only column "b" must be present in test DataFrame.'
+        ), 'Only column "B" must be present in test DataFrame.'
 
         train_schema = train_df.schema
         test_schema = test_df.schema
         assert train_schema.names == [
-            "b"
-        ], 'Only column "b" must be present in train DataFrame.'
+            "B"
+        ], 'Only column "B" must be present in train DataFrame.'
         assert test_schema.names == [
-            "b"
-        ], 'Only column "b" must be present in test DataFrame.'
+            "B"
+        ], 'Only column "B" must be present in test DataFrame.'
 
-    for train_df, test_df in cv.loc(["b", "d"]):
+    for train_df, test_df in cv.loc(["B", "D"]):
         assert (
             train_df.num_columns == 2
-        ), 'Only columns ["b", "d"] must be present in train DataFrame.'
+        ), 'Only columns ["B", "D"] must be present in train DataFrame.'
         assert (
             test_df.num_columns == 2
-        ), 'Only column ["b", "d"] must be present in test DataFrame.'
+        ), 'Only column ["B", "D"] must be present in test DataFrame.'
 
         train_schema = train_df.schema
         test_schema = test_df.schema
         assert train_schema.names == [
-            "b",
-            "d",
-        ], 'Only column ["b", "d"] must be present in train DataFrame.'
+            "B",
+            "D",
+        ], 'Only column ["B", "D"] must be present in train DataFrame.'
         assert test_schema.names == [
-            "b",
-            "d",
-        ], 'Only column ["b", "d"] must be present in test DataFrame.'
+            "B",
+            "D",
+        ], 'Only column ["B", "D"] must be present in test DataFrame.'
 
     for train_df, test_df in cv.loc([0, 2]):
         assert (
             train_df.num_columns == 2
-        ), 'Only columns ["a", "c"] must be present in train DataFrame.'
+        ), 'Only columns ["A", "C"] must be present in train DataFrame.'
         assert (
             test_df.num_columns == 2
-        ), 'Only column ["a", "c"] must be present in test DataFrame.'
+        ), 'Only column ["A", "C"] must be present in test DataFrame.'
 
         train_schema = train_df.schema
         test_schema = test_df.schema
         assert train_schema.names == [
-            "a",
-            "c",
-        ], 'Only column ["a", "c"] must be present in train DataFrame.'
+            "A",
+            "C",
+        ], 'Only column ["A", "C"] must be present in train DataFrame.'
         assert test_schema.names == [
-            "a",
-            "c",
-        ], 'Only column ["a", "c"] must be present in test DataFrame.'
+            "A",
+            "C",
+        ], 'Only column ["A", "C"] must be present in test DataFrame.'
 
 
 def test_cv_null():
@@ -175,10 +174,10 @@ def test_cv_null():
     d_null = np.random.randint(0, SIZE, size=100)
 
     df_null = df
-    df_null.loc[df_null.index[a_null], "a"] = np.nan
-    df_null.loc[df_null.index[b_null], "b"] = np.nan
-    df_null.loc[df_null.index[c_null], "c"] = np.nan
-    df_null.loc[df_null.index[d_null], "d"] = np.nan
+    df_null.loc[df_null.index[a_null], "A"] = np.nan
+    df_null.loc[df_null.index[b_null], "B"] = np.nan
+    df_null.loc[df_null.index[c_null], "C"] = np.nan
+    df_null.loc[df_null.index[d_null], "D"] = np.nan
 
     non_null = df_null.dropna()
     cv = pbn.CrossValidation(df_null)

diff --git a/tests/dataset/holdout_test.py b/tests/dataset/holdout_test.py
index 73fe7913..773a5126 100644
--- a/tests/dataset/holdout_test.py
+++ b/tests/dataset/holdout_test.py
@@ -1,8 +1,7 @@
 import numpy as np
 import pandas as pd
-from util_test import generate_normal_data
-
 import pybnesian as pbn
+from util_test import generate_normal_data
 
 SIZE = 10000
@@ -28,9 +27,9 @@ def test_holdout_disjoint():
     combination = pd.concat([train_df.to_pandas(), test_df.to_pandas()])
     assert (
-        df.sort_values("a", axis=0)
+        df.sort_values("A", axis=0)
         .reset_index(drop=True)
-        .equals(combination.sort_values("a", axis=0).reset_index(drop=True))
+        .equals(combination.sort_values("A", axis=0).reset_index(drop=True))
     ), "The combination of train and test dataset is not equal to the original DataFrame."
 
     hold = pbn.HoldOut(df, test_ratio=0.3)
@@ -50,9 +49,9 @@ def test_holdout_disjoint():
     combination = pd.concat([train_df.to_pandas(), test_df.to_pandas()])
     assert (
-        df.sort_values("a", axis=0)
+        df.sort_values("A", axis=0)
         .reset_index(drop=True)
-        .equals(combination.sort_values("a", axis=0).reset_index(drop=True))
+        .equals(combination.sort_values("A", axis=0).reset_index(drop=True))
     ), "The combination of train and test dataset is not equal to the original DataFrame."
@@ -89,10 +88,10 @@ def test_holdout_null():
     d_null = np.random.randint(0, SIZE, size=100)
 
     df_null = df
-    df_null.loc[df_null.index[a_null], "a"] = np.nan
-    df_null.loc[df_null.index[b_null], "b"] = np.nan
-    df_null.loc[df_null.index[c_null], "c"] = np.nan
-    df_null.loc[df_null.index[d_null], "d"] = np.nan
+    df_null.loc[df_null.index[a_null], "A"] = np.nan
+    df_null.loc[df_null.index[b_null], "B"] = np.nan
+    df_null.loc[df_null.index[c_null], "C"] = np.nan
+    df_null.loc[df_null.index[d_null], "D"] = np.nan
 
     non_null = df_null.dropna()
     hold = pbn.HoldOut(df_null)
@@ -112,9 +111,9 @@ def test_holdout_null():
     combination = pd.concat([train_df.to_pandas(), test_df.to_pandas()])
     assert (
-        combination.sort_values("a", axis=0)
+        combination.sort_values("A", axis=0)
         .reset_index(drop=True)
-        .equals(non_null.sort_values("a", axis=0).reset_index(drop=True))
+        .equals(non_null.sort_values("A", axis=0).reset_index(drop=True))
     ), "The combination of train and test dataset is not equal to the original DataFrame."
 
     hold_null = pbn.HoldOut(df_null, include_null=True)
@@ -132,7 +131,7 @@ def test_holdout_null():
     combination = pd.concat([train_df.to_pandas(), test_df.to_pandas()])
     assert (
-        combination.sort_values(["a", "b", "c", "d"], axis=0)
+        combination.sort_values(["A", "B", "C", "D"], axis=0)
         .reset_index(drop=True)
-        .equals(df.sort_values(["a", "b", "c", "d"], axis=0).reset_index(drop=True))
+        .equals(df.sort_values(["A", "B", "C", "D"], axis=0).reset_index(drop=True))
     ), "The combination of train and test dataset is not equal to the original DataFrame."

diff --git a/tests/factors/continuous/CKDE_test.py b/tests/factors/continuous/CKDE_test.py
index 2137e483..e16fd207 100644
--- a/tests/factors/continuous/CKDE_test.py
+++ b/tests/factors/continuous/CKDE_test.py
@@ -1,14 +1,13 @@
 import numpy as np
 import pandas as pd
 import pyarrow as pa
+import pybnesian as pbn
 import pytest
 from scipy.stats import gaussian_kde
 from scipy.stats import multivariate_normal as mvn
 from scipy.stats import norm
 from util_test import generate_normal_data
 
-import pybnesian as pbn
-
 SIZE = 10000
 SMALL_SIZE = 10
 TEST_SIZE = 50
@@ -20,10 +19,10 @@ def test_variable():
     for variable, evidence in [
-        ("a", []),
-        ("b", ["a"]),
-        ("c", ["a", "b"]),
-        ("d", ["a", "b", "c"]),
+        ("A", []),
+        ("B", ["A"]),
+        ("C", ["A", "B"]),
+        ("D", ["A", "B", "C"]),
     ]:
         cpd = pbn.CKDE(variable, evidence)
         assert cpd.variable() == variable
 
 
 def test_evidence():
     for variable, evidence in [
-        ("a", []),
-        ("b", ["a"]),
-        ("c", ["a", "b"]),
-        ("d", ["a", "b", "c"]),
+        ("A", []),
+        ("B", ["A"]),
+        ("C", ["A", "B"]),
+        ("D", ["A", "B", "C"]),
     ]:
         cpd = pbn.CKDE(variable, evidence)
         assert cpd.evidence() == evidence
 
 
 def test_kde_data_type():
-    k = pbn.CKDE("a", [])
+    k = pbn.CKDE("A", [])
 
     with pytest.raises(ValueError) as ex:
         k.data_type()
@@ -64,10 +63,10 @@ def _test_ckde_kde_joint_iter(variable, evidence, _df):
     ), "kde_joint do not return a reference to the KDE joint, but a copy."
 
     for variable, evidence in [
-        ("a", []),
-        ("b", ["a"]),
-        ("c", ["a", "b"]),
-        ("d", ["a", "b", "c"]),
+        ("A", []),
+        ("B", ["A"]),
+        ("C", ["A", "B"]),
+        ("D", ["A", "B", "C"]),
     ]:
         _test_ckde_kde_joint_iter(variable, evidence, df)
         _test_ckde_kde_joint_iter(variable, evidence, df_float)
@@ -90,10 +89,10 @@ def _test_ckde_kde_marg_iter(variable, evidence, _df):
         pass
 
     for variable, evidence in [
-        ("a", []),
-        ("b", ["a"]),
-        ("c", ["a", "b"]),
-        ("d", ["a", "b", "c"]),
+        ("A", []),
+        ("B", ["A"]),
+        ("C", ["A", "B"]),
+        ("D", ["A", "B", "C"]),
     ]:
         _test_ckde_kde_marg_iter(variable, evidence, df)
         _test_ckde_kde_marg_iter(variable, evidence, df_float)
@@ -125,10 +124,10 @@ def _test_ckde_fit(variables, _df, instances):
         assert cpd.num_instances() == instances
 
     for variable, evidence in [
-        ("a", []),
-        ("b", ["a"]),
-        ("c", ["a", "b"]),
-        ("d", ["a", "b", "c"]),
+        ("A", []),
+        ("B", ["A"]),
+        ("C", ["A", "B"]),
+        ("D", ["A", "B", "C"]),
     ]:
         variables = [variable] + evidence
         for instances in [50, 1000, 10000]:
@@ -171,22 +170,22 @@ def _test_ckde_fit_null(variable, evidence, variables, _df, instances):
     d_null = np.random.randint(0, SIZE, size=100)
 
     df_null = df.copy()
-    df_null.loc[df_null.index[a_null], "a"] = np.nan
-    df_null.loc[df_null.index[b_null], "b"] = np.nan
-    df_null.loc[df_null.index[c_null], "c"] = np.nan
-    df_null.loc[df_null.index[d_null], "d"] = np.nan
+    df_null.loc[df_null.index[a_null], "A"] = np.nan
+    df_null.loc[df_null.index[b_null], "B"] = np.nan
+    df_null.loc[df_null.index[c_null], "C"] = np.nan
+    df_null.loc[df_null.index[d_null], "D"] = np.nan
 
     df_null_float = df_float.copy()
-    df_null_float.loc[df_null_float.index[a_null], "a"] = np.nan
-    df_null_float.loc[df_null_float.index[b_null], "b"] = np.nan
-    df_null_float.loc[df_null_float.index[c_null], "c"] = np.nan
-    df_null_float.loc[df_null_float.index[d_null], "d"] = np.nan
+    df_null_float.loc[df_null_float.index[a_null], "A"] = np.nan
+    df_null_float.loc[df_null_float.index[b_null], "B"] = np.nan
+    df_null_float.loc[df_null_float.index[c_null], "C"] = np.nan
+    df_null_float.loc[df_null_float.index[d_null], "D"] = np.nan
 
     for variable, evidence in [
-        ("a", []),
-        ("b", ["a"]),
-        ("c", ["a", "b"]),
-        ("d", ["a", "b", "c"]),
+        ("A", []),
+        ("B", ["A"]),
+        ("C", ["A", "B"]),
+        ("D", ["A", "B", "C"]),
     ]:
         variables = [variable] + evidence
         for instances in [50, 1000, 10000]:
@@ -315,27 +314,27 @@ def _test_ckde_logl(variable, evidence, _df, _test_df):
     test_df_float = test_df.astype("float32")
 
     for variable, evidence in [
-        ("a", []),
-        ("b", ["a"]),
-        ("c", ["a", "b"]),
-        ("d", ["a", "b", "c"]),
+        ("A", []),
+        ("B", ["A"]),
+        ("C", ["A", "B"]),
+        ("D", ["A", "B", "C"]),
     ]:
         _test_ckde_logl(variable, evidence, df, test_df)
         _test_ckde_logl(variable, evidence, df_small, test_df)
         _test_ckde_logl(variable, evidence, df_float, test_df_float)
         _test_ckde_logl(variable, evidence, df_small_float, test_df_float)
 
-    cpd = pbn.CKDE("d", ["a", "b", "c"])
+    cpd = pbn.CKDE("D", ["A", "B", "C"])
     cpd.fit(df)
-    cpd2 = pbn.CKDE("d", ["c", "b", "a"])
+    cpd2 = pbn.CKDE("D", ["C", "B", "A"])
     cpd2.fit(df)
     assert np.all(
         np.isclose(cpd.logl(test_df), cpd2.logl(test_df))
     ), "Order of evidence changes logl() result."
 
-    cpd = pbn.CKDE("d", ["a", "b", "c"])
+    cpd = pbn.CKDE("D", ["A", "B", "C"])
     cpd.fit(df_float)
-    cpd2 = pbn.CKDE("d", ["c", "b", "a"])
+    cpd2 = pbn.CKDE("D", ["C", "B", "A"])
     cpd2.fit(df_float)
     assert np.all(
         np.isclose(cpd.logl(test_df_float), cpd2.logl(test_df_float), atol=0.0005)
@@ -369,31 +368,31 @@ def _test_ckde_logl_null(variable, evidence, _df, _test_df):
     d_null = np.random.randint(0, TEST_SIZE, size=10)
 
     df_null = test_df.copy()
-    df_null.loc[df_null.index[a_null], "a"] = np.nan
-    df_null.loc[df_null.index[b_null], "b"] = np.nan
-    df_null.loc[df_null.index[c_null], "c"] = np.nan
-    df_null.loc[df_null.index[d_null], "d"] = np.nan
+    df_null.loc[df_null.index[a_null], "A"] = np.nan
+    df_null.loc[df_null.index[b_null], "B"] = np.nan
+    df_null.loc[df_null.index[c_null], "C"] = np.nan
+    df_null.loc[df_null.index[d_null], "D"] = np.nan
 
     df_null_float = test_df_float.copy()
-    df_null_float.loc[df_null_float.index[a_null], "a"] = np.nan
-    df_null_float.loc[df_null_float.index[b_null], "b"] = np.nan
-    df_null_float.loc[df_null_float.index[c_null], "c"] = np.nan
-    df_null_float.loc[df_null_float.index[d_null], "d"] = np.nan
+    df_null_float.loc[df_null_float.index[a_null], "A"] = np.nan
+    df_null_float.loc[df_null_float.index[b_null], "B"] = np.nan
+    df_null_float.loc[df_null_float.index[c_null], "C"] = np.nan
+    df_null_float.loc[df_null_float.index[d_null], "D"] = np.nan
 
     for variable, evidence in [
-        ("a", []),
-        ("b", ["a"]),
-        ("c", ["a", "b"]),
-        ("d", ["a", "b", "c"]),
+        ("A", []),
+        ("B", ["A"]),
+        ("C", ["A", "B"]),
+        ("D", ["A", "B", "C"]),
     ]:
         _test_ckde_logl_null(variable, evidence, df, df_null)
         _test_ckde_logl_null(variable, evidence, df_small, df_null)
         _test_ckde_logl_null(variable, evidence, df_float, df_null_float)
         _test_ckde_logl_null(variable, evidence, df_small_float, df_null_float)
 
-    cpd = pbn.CKDE("d", ["a", "b", "c"])
+    cpd = pbn.CKDE("D", ["A", "B", "C"])
     cpd.fit(df)
-    cpd2 = pbn.CKDE("d", ["c", "b", "a"])
+    cpd2 = pbn.CKDE("D", ["C", "B", "A"])
     cpd2.fit(df)
 
     ll = cpd.logl(df_null)
@@ -402,9 +401,9 @@ def _test_ckde_logl_null(variable, evidence, _df, _test_df):
     ll2 = cpd2.logl(df_null)
     assert np.all(
         np.isclose(ll, ll2, equal_nan=True)
     ), "Order of evidence changes the position of nan values."
 
-    cpd = pbn.CKDE("d", ["a", "b", "c"])
+    cpd = pbn.CKDE("D", ["A", "B", "C"])
     cpd.fit(df_float)
-    cpd2 = pbn.CKDE("d", ["c", "b", "a"])
+    cpd2 = pbn.CKDE("D", ["C", "B", "A"])
     cpd2.fit(df_float)
 
     ll = cpd.logl(df_null_float)
@@ -436,27 +435,27 @@ def _test_ckde_slogl(variable, evidence, _df, _test_df):
     test_df_float = test_df.astype("float32")
 
     for variable, evidence in [
-        ("a", []),
-        ("b", ["a"]),
-        ("c", ["a", "b"]),
-        ("d", ["a", "b", "c"]),
+        ("A", []),
+        ("B", ["A"]),
+        ("C", ["A", "B"]),
+        ("D", ["A", "B", "C"]),
     ]:
         _test_ckde_slogl(variable, evidence, df, test_df)
         _test_ckde_slogl(variable, evidence, df_small, test_df)
         _test_ckde_slogl(variable, evidence, df_float, test_df_float)
         _test_ckde_slogl(variable, evidence, df_small_float, test_df_float)
 
-    cpd = pbn.CKDE("d", ["a", "b", "c"])
+    cpd = pbn.CKDE("D", ["A", "B", "C"])
     cpd.fit(df)
-    cpd2 = pbn.CKDE("d", ["c", "b", "a"])
+    cpd2 = pbn.CKDE("D", ["C", "B", "A"])
     cpd2.fit(df)
     assert np.all(
         np.isclose(cpd.slogl(test_df), cpd2.slogl(test_df))
     ), "Order of evidence changes slogl() result."
 
-    cpd = pbn.CKDE("d", ["a", "b", "c"])
+    cpd = pbn.CKDE("D", ["A", "B", "C"])
     cpd.fit(df_float)
-    cpd2 = pbn.CKDE("d", ["c", "b", "a"])
+    cpd2 = pbn.CKDE("D", ["C", "B", "A"])
     cpd2.fit(df_float)
     assert np.all(
         np.isclose(cpd.slogl(test_df_float), cpd2.slogl(test_df_float))
@@ -491,39 +490,39 @@ def _test_ckde_slogl_null(variable, evidence, _df, _test_df):
     d_null = np.random.randint(0, TEST_SIZE, size=10)
 
     df_null = test_df.copy()
-    df_null.loc[df_null.index[a_null], "a"] = np.nan
-    df_null.loc[df_null.index[b_null], "b"] = np.nan
-    df_null.loc[df_null.index[c_null], "c"] = np.nan
-    df_null.loc[df_null.index[d_null], "d"] = np.nan
+    df_null.loc[df_null.index[a_null], "A"] = np.nan
+    df_null.loc[df_null.index[b_null], "B"] = np.nan
+    df_null.loc[df_null.index[c_null], "C"] = np.nan
+    df_null.loc[df_null.index[d_null], "D"] = np.nan
 
     df_null_float = test_df_float.copy()
-    df_null_float.loc[df_null_float.index[a_null], "a"] = np.nan
-    df_null_float.loc[df_null_float.index[b_null], "b"] = np.nan
-    df_null_float.loc[df_null_float.index[c_null], "c"] = np.nan
-    df_null_float.loc[df_null_float.index[d_null], "d"] = np.nan
+    df_null_float.loc[df_null_float.index[a_null], "A"] = np.nan
+    df_null_float.loc[df_null_float.index[b_null], "B"] = np.nan
+    df_null_float.loc[df_null_float.index[c_null], "C"] = np.nan
+    df_null_float.loc[df_null_float.index[d_null], "D"] = np.nan
 
     for variable, evidence in [
-        ("a", []),
-        ("b", ["a"]),
-        ("c", ["a", "b"]),
-        ("d", ["a", "b", "c"]),
+        ("A", []),
+        ("B", ["A"]),
+        ("C", ["A", "B"]),
+        ("D", ["A", "B", "C"]),
    ]:
         _test_ckde_slogl_null(variable, evidence, df, df_null)
         _test_ckde_slogl_null(variable, evidence, df_small, df_null)
         _test_ckde_slogl_null(variable, evidence, df_float, df_null_float)
         _test_ckde_slogl_null(variable, evidence, df_small_float, df_null_float)
 
-    cpd = pbn.CKDE("d", ["a", "b", "c"])
+    cpd = pbn.CKDE("D", ["A", "B", "C"])
     cpd.fit(df)
-    cpd2 = pbn.CKDE("d", ["c", "b", "a"])
+    cpd2 = pbn.CKDE("D", ["C", "B", "A"])
     cpd2.fit(df)
     assert np.all(
         np.isclose(cpd.slogl(df_null), cpd2.slogl(df_null))
     ), "Order of evidence changes slogl() result."
 
-    cpd = pbn.CKDE("d", ["a", "b", "c"])
+    cpd = pbn.CKDE("D", ["A", "B", "C"])
     cpd.fit(df_float)
-    cpd2 = pbn.CKDE("d", ["c", "b", "a"])
+    cpd2 = pbn.CKDE("D", ["C", "B", "A"])
     cpd2.fit(df_float)
     assert np.all(
         np.isclose(cpd.slogl(df_null_float), cpd2.slogl(df_null_float))
@@ -550,27 +549,27 @@ def _test_ckde_cdf(variable, evidence, _df, _test_df):
     test_df_float = test_df.astype("float32")
 
     for variable, evidence in [
-        ("a", []),
-        ("b", ["a"]),
-        ("c", ["a", "b"]),
-        ("d", ["a", "b", "c"]),
+        ("A", []),
+        ("B", ["A"]),
+        ("C", ["A", "B"]),
+        ("D", ["A", "B", "C"]),
     ]:
         _test_ckde_cdf(variable, evidence, df, test_df)
         _test_ckde_cdf(variable, evidence, df_small, test_df)
         _test_ckde_cdf(variable, evidence, df_float, test_df_float)
         _test_ckde_cdf(variable, evidence, df_small_float, test_df_float)
 
-    cpd = pbn.CKDE("d", ["a", "b", "c"])
+    cpd = pbn.CKDE("D", ["A", "B", "C"])
     cpd.fit(df)
-    cpd2 = pbn.CKDE("d", ["c", "b", "a"])
+    cpd2 = pbn.CKDE("D", ["C", "B", "A"])
     cpd2.fit(df)
     assert np.all(
         np.isclose(cpd.cdf(test_df), cpd2.cdf(test_df))
     ), "Order of evidence changes logl() result."
 
-    cpd = pbn.CKDE("d", ["a", "b", "c"])
+    cpd = pbn.CKDE("D", ["A", "B", "C"])
     cpd.fit(df_float)
-    cpd2 = pbn.CKDE("d", ["c", "b", "a"])
+    cpd2 = pbn.CKDE("D", ["C", "B", "A"])
     cpd2.fit(df_float)
     assert np.all(
         np.isclose(cpd.cdf(test_df_float), cpd2.cdf(test_df_float), atol=0.0005)
@@ -604,39 +603,39 @@ def _test_ckde_cdf_null(variable, evidence, _df, _test_df):
     d_null = np.random.randint(0, TEST_SIZE, size=10)
 
     df_null = test_df.copy()
-    df_null.loc[df_null.index[a_null], "a"] = np.nan
-    df_null.loc[df_null.index[b_null], "b"] = np.nan
-    df_null.loc[df_null.index[c_null], "c"] = np.nan
-    df_null.loc[df_null.index[d_null], "d"] = np.nan
+    df_null.loc[df_null.index[a_null], "A"] = np.nan
+    df_null.loc[df_null.index[b_null], "B"] = np.nan
+    df_null.loc[df_null.index[c_null], "C"] = np.nan
+    df_null.loc[df_null.index[d_null], "D"] = np.nan
 
     df_null_float = test_df_float.copy()
-    df_null_float.loc[df_null_float.index[a_null], "a"] = np.nan
-    df_null_float.loc[df_null_float.index[b_null], "b"] = np.nan
-    df_null_float.loc[df_null_float.index[c_null], "c"] = np.nan
-    df_null_float.loc[df_null_float.index[d_null], "d"] = np.nan
+    df_null_float.loc[df_null_float.index[a_null], "A"] = np.nan
+    df_null_float.loc[df_null_float.index[b_null], "B"] = np.nan
+    df_null_float.loc[df_null_float.index[c_null], "C"] = np.nan
+    df_null_float.loc[df_null_float.index[d_null], "D"] = np.nan
 
     for variable, evidence in [
-        ("a", []),
-        ("b", ["a"]),
-        ("c", ["a", "b"]),
-        ("d", ["a", "b", "c"]),
+        ("A", []),
+        ("B", ["A"]),
+        ("C", ["A", "B"]),
+        ("D", ["A", "B", "C"]),
     ]:
         _test_ckde_cdf_null(variable, evidence, df, df_null)
         _test_ckde_cdf_null(variable, evidence, df_small, df_null)
         _test_ckde_cdf_null(variable, evidence, df_float, df_null_float)
         _test_ckde_cdf_null(variable, evidence, df_small_float, df_null_float)
 
-    cpd = pbn.CKDE("d", ["a", "b", "c"])
+    cpd = pbn.CKDE("D", ["A", "B", "C"])
     cpd.fit(df)
-    cpd2 = pbn.CKDE("d", ["c", "b", "a"])
+    cpd2 = pbn.CKDE("D", ["C", "B", "A"])
     cpd2.fit(df)
     assert np.all(
         np.isclose(cpd.cdf(df_null), cpd2.cdf(df_null), equal_nan=True)
     ), "Order of evidence changes cdf() result."
 
-    cpd = pbn.CKDE("d", ["a", "b", "c"])
+    cpd = pbn.CKDE("D", ["A", "B", "C"])
     cpd.fit(df_float)
-    cpd2 = pbn.CKDE("d", ["c", "b", "a"])
+    cpd2 = pbn.CKDE("D", ["C", "B", "A"])
     cpd2.fit(df_float)
     assert np.all(
         np.isclose(
@@ -648,7 +647,7 @@ def _test_ckde_cdf_null(variable, evidence, _df, _test_df):
 
 def test_ckde_sample():
     SAMPLE_SIZE = 1000
 
-    cpd = pbn.CKDE("a", [])
+    cpd = pbn.CKDE("A", [])
     cpd.fit(df)
     sampled = cpd.sample(SAMPLE_SIZE, None, 0)
@@ -656,27 +655,27 @@ def test_ckde_sample():
     assert sampled.type == pa.float64()
     assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE
 
-    cpd = pbn.CKDE("b", ["a"])
+    cpd = pbn.CKDE("B", ["A"])
     cpd.fit(df)
 
-    sampling_df = pd.DataFrame({"a": np.full((SAMPLE_SIZE,), 3.0)})
+    sampling_df = pd.DataFrame({"A": np.full((SAMPLE_SIZE,), 3.0)})
     sampled = cpd.sample(SAMPLE_SIZE, sampling_df, 0)
 
     assert sampled.type == pa.float64()
     assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE
 
-    cpd = pbn.CKDE("c", ["a", "b"])
+    cpd = pbn.CKDE("C", ["A", "B"])
     cpd.fit(df)
 
     sampling_df = pd.DataFrame(
-        {"a": np.full((SAMPLE_SIZE,), 3.0), "b": np.full((SAMPLE_SIZE,), 7.45)}
+        {"A": np.full((SAMPLE_SIZE,), 3.0), "B": np.full((SAMPLE_SIZE,), 7.45)}
     )
     sampled = cpd.sample(SAMPLE_SIZE, sampling_df, 0)
 
     assert sampled.type == pa.float64()
     assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE
 
-    cpd = pbn.CKDE("a", [])
+    cpd = pbn.CKDE("A", [])
     cpd.fit(df_float)
     sampled = cpd.sample(SAMPLE_SIZE, None, 0)
@@ -684,22 +683,22 @@ def test_ckde_sample():
     assert sampled.type == pa.float32()
     assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE
 
-    cpd = pbn.CKDE("b", ["a"])
+    cpd = pbn.CKDE("B", ["A"])
     cpd.fit(df_float)
 
-    sampling_df = pd.DataFrame({"a": np.full((SAMPLE_SIZE,), 3.0, dtype=np.float32)})
+    sampling_df = pd.DataFrame({"A": np.full((SAMPLE_SIZE,), 3.0, dtype=np.float32)})
     sampled = cpd.sample(SAMPLE_SIZE, sampling_df, 0)
 
     assert sampled.type == pa.float32()
     assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE
 
-    cpd = pbn.CKDE("c", ["a", "b"])
+    cpd = pbn.CKDE("C", ["A", "B"])
     cpd.fit(df_float)
 
     sampling_df = pd.DataFrame(
         {
-            "a": np.full((SAMPLE_SIZE,), 3.0, dtype=np.float32),
-            "b": np.full((SAMPLE_SIZE,), 7.45, dtype=np.float32),
+            "A": np.full((SAMPLE_SIZE,), 3.0, dtype=np.float32),
+            "B": np.full((SAMPLE_SIZE,), 7.45, dtype=np.float32),
         }
     )
     sampled = cpd.sample(SAMPLE_SIZE, sampling_df, 0)

diff --git a/tests/factors/continuous/KDE_test.py b/tests/factors/continuous/KDE_test.py
index 2d03284d..70629ed2 100644
--- a/tests/factors/continuous/KDE_test.py
+++ b/tests/factors/continuous/KDE_test.py
@@ -1,19 +1,18 @@
 import numpy as np
 import pyarrow as pa
+import pybnesian as pbn
 import pytest
+from pybnesian import BandwidthSelector
 from scipy.stats import gaussian_kde
 from util_test import generate_normal_data
 
-import pybnesian as pbn
-from pybnesian import BandwidthSelector
-
 SIZE = 500
 df = generate_normal_data(SIZE, seed=0)
 df_float = df.astype("float32")
 
 
 def test_check_type():
-    cpd = pbn.KDE(["a"])
+    cpd = pbn.KDE(["A"])
     cpd.fit(df)
     with pytest.raises(ValueError) as ex:
         cpd.logl(df_float)
@@ -32,13 +31,13 @@ def test_check_type():
 
 
 def test_kde_variables():
-    for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]:
+    for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]:
         cpd = pbn.KDE(variables)
         assert cpd.variables() == variables
 
 
 def test_kde_bandwidth():
-    for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]:
+    for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]:
         for instances in [50, 1000, 10000]:
             npdata = df.loc[:, variables].to_numpy()
             # Test normal reference rule
@@ -72,7 +71,7 @@ def test_kde_bandwidth():
             np.isclose(cpd.bandwidth, scipy_kde.covariance)
         ), "Wrong bandwidth computed with Scott's rule."
 
-    cpd = pbn.KDE(["a"])
+    cpd = pbn.KDE(["A"])
     cpd.fit(df)
     cpd.bandwidth = [[1]]
     assert cpd.bandwidth == np.asarray([[1]]), "Could not change bandwidth."
@@ -91,14 +90,14 @@ def bandwidth(self, df, variables):
 
 
 def test_kde_new_bandwidth():
-    kde = pbn.KDE(["a"], UnitaryBandwidth())
+    kde = pbn.KDE(["A"], UnitaryBandwidth())
     kde.fit(df)
     assert kde.bandwidth == np.eye(1)
 
     kde.fit(df_float)
     assert kde.bandwidth == np.eye(1)
 
-    kde = pbn.KDE(["a", "b", "c", "d"], UnitaryBandwidth())
+    kde = pbn.KDE(["A", "B", "C", "D"], UnitaryBandwidth())
     kde.fit(df)
     assert np.all(kde.bandwidth == np.eye(4))
 
 
 def test_kde_data_type():
-    k = pbn.KDE(["a"])
+    k = pbn.KDE(["A"])
 
     with pytest.raises(ValueError) as ex:
         k.data_type()
@@ -136,7 +135,7 @@ def _test_kde_fit_iter(variables, _df, instances):
     assert scipy_kde.n == cpd.num_instances(), "Wrong number of training instances."
     assert scipy_kde.d == cpd.num_variables(), "Wrong number of training variables."
 
-    for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]:
+    for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]:
         for instances in [50, 150, 500]:
             _test_kde_fit_iter(variables, df, instances)
             _test_kde_fit_iter(variables, df_float, instances)
@@ -177,18 +176,18 @@ def _test_kde_fit_null_iter(variables, _df, instances):
     d_null = np.random.randint(0, SIZE, size=100)
 
     df_null = df.copy()
-    df_null.loc[df_null.index[a_null], "a"] = np.nan
-    df_null.loc[df_null.index[b_null], "b"] = np.nan
-    df_null.loc[df_null.index[c_null], "c"] = np.nan
-    df_null.loc[df_null.index[d_null], "d"] = np.nan
+    df_null.loc[df_null.index[a_null], "A"] = np.nan
+    df_null.loc[df_null.index[b_null], "B"] = np.nan
+    df_null.loc[df_null.index[c_null], "C"] = np.nan
+    df_null.loc[df_null.index[d_null], "D"] = np.nan
 
     df_null_float = df_float.copy()
-    df_null_float.loc[df_null_float.index[a_null], "a"] = np.nan
-    df_null_float.loc[df_null_float.index[b_null], "b"] = np.nan
-    df_null_float.loc[df_null_float.index[c_null], "c"] = np.nan
-    df_null_float.loc[df_null_float.index[d_null], "d"] = np.nan
+    df_null_float.loc[df_null_float.index[a_null], "A"] = np.nan
+    df_null_float.loc[df_null_float.index[b_null], "B"] = np.nan
+    df_null_float.loc[df_null_float.index[c_null], "C"] = np.nan
+    df_null_float.loc[df_null_float.index[d_null], "D"] = np.nan
 
-    for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]:
+    for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]:
         for instances in [50, 150, 500]:
             _test_kde_fit_null_iter(variables, df_null, instances)
             _test_kde_fit_null_iter(variables, df_null_float, instances)
@@ -243,21 +242,21 @@ def _test_kde_logl_iter(variables, _df, _test_df):
     test_df = generate_normal_data(50, seed=1)
     test_df_float = test_df.astype("float32")
 
-    for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]:
+    for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]:
         _test_kde_logl_iter(variables, df, test_df)
         _test_kde_logl_iter(variables, df_float, test_df_float)
 
-    cpd = pbn.KDE(["d", "a", "b", "c"])
+    cpd = pbn.KDE(["D", "A", "B", "C"])
     cpd.fit(df)
-    cpd2 = pbn.KDE(["a", "c", "d", "b"])
+    cpd2 = pbn.KDE(["A", "C", "D", "B"])
     cpd2.fit(df)
     assert np.all(
         np.isclose(cpd.logl(test_df), cpd2.logl(test_df))
     ), "Order of evidence changes logl() result."
 
-    cpd = pbn.KDE(["d", "a", "b", "c"])
+    cpd = pbn.KDE(["D", "A", "B", "C"])
     cpd.fit(df_float)
-    cpd2 = pbn.KDE(["a", "c", "d", "b"])
+    cpd2 = pbn.KDE(["A", "C", "D", "B"])
     cpd2.fit(df_float)
     assert np.all(
         np.isclose(cpd.logl(test_df_float), cpd2.logl(test_df_float))
@@ -324,32 +323,32 @@ def _test_kde_logl_null_iter(variables, _df, _test_df):
     d_null = np.random.randint(0, TEST_SIZE, size=10)
 
     df_null = test_df.copy()
-    df_null.loc[df_null.index[a_null], "a"] = np.nan
-    df_null.loc[df_null.index[b_null], "b"] = np.nan
-    df_null.loc[df_null.index[c_null], "c"] = np.nan
-    df_null.loc[df_null.index[d_null], "d"] = np.nan
+    df_null.loc[df_null.index[a_null], "A"] = np.nan
+    df_null.loc[df_null.index[b_null], "B"] = np.nan
+    df_null.loc[df_null.index[c_null], "C"] = np.nan
+    df_null.loc[df_null.index[d_null], "D"] = np.nan
 
     df_null_float = test_df_float.copy()
-    df_null_float.loc[df_null_float.index[a_null], "a"] = np.nan
-    df_null_float.loc[df_null_float.index[b_null], "b"] = np.nan
-    df_null_float.loc[df_null_float.index[c_null], "c"] = np.nan
-    df_null_float.loc[df_null_float.index[d_null], "d"] = np.nan
+    df_null_float.loc[df_null_float.index[a_null], "A"] = np.nan
+    df_null_float.loc[df_null_float.index[b_null], "B"] = np.nan
+    df_null_float.loc[df_null_float.index[c_null], "C"] = np.nan
+    df_null_float.loc[df_null_float.index[d_null], "D"] = np.nan
 
-    for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]:
+    for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]:
         _test_kde_logl_null_iter(variables, df, df_null)
         _test_kde_logl_null_iter(variables, df_float, df_null_float)
 
-    cpd = pbn.KDE(["d", "a", "b", "c"])
+    cpd = pbn.KDE(["D", "A", "B", "C"])
     cpd.fit(df)
-    cpd2 = pbn.KDE(["a", "c", "d", "b"])
+    cpd2 = pbn.KDE(["A", "C", "D", "B"])
     cpd2.fit(df)
     assert np.all(
         np.isclose(cpd.logl(df_null), cpd2.logl(df_null), equal_nan=True)
     ), "Order of evidence changes logl() result."
 
-    cpd = pbn.KDE(["d", "a", "b", "c"])
+    cpd = pbn.KDE(["D", "A", "B", "C"])
     cpd.fit(df_float)
-    cpd2 = pbn.KDE(["a", "c", "d", "b"])
+    cpd2 = pbn.KDE(["A", "C", "D", "B"])
     cpd2.fit(df_float)
     assert np.all(
         np.isclose(
@@ -391,21 +390,21 @@ def _test_kde_slogl_iter(variables, _df, _test_df):
     test_df = generate_normal_data(50, seed=1)
     test_df_float = test_df.astype("float32")
 
-    for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]:
+    for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]:
         _test_kde_slogl_iter(variables, df, test_df)
         _test_kde_slogl_iter(variables, df_float, test_df_float)
 
-    cpd = pbn.KDE(["d", "a", "b", "c"])
+    cpd = pbn.KDE(["D", "A", "B", "C"])
     cpd.fit(df)
-    cpd2 = pbn.KDE(["a", "c", "d", "b"])
+    cpd2 = pbn.KDE(["A", "C", "D", "B"])
     cpd2.fit(df)
     assert np.all(
         np.isclose(cpd.slogl(test_df), cpd2.slogl(test_df))
     ), "Order of evidence changes slogl() result."
 
-    cpd = pbn.KDE(["d", "a", "b", "c"])
+    cpd = pbn.KDE(["D", "A", "B", "C"])
     cpd.fit(df_float)
-    cpd2 = pbn.KDE(["a", "c", "d", "b"])
+    cpd2 = pbn.KDE(["A", "C", "D", "B"])
     cpd2.fit(df_float)
     assert np.all(
         np.isclose(cpd.slogl(test_df_float), cpd2.slogl(test_df_float))
@@ -456,32 +455,32 @@ def _test_kde_slogl_null_iter(variables, _df, _test_df):
     d_null = np.random.randint(0, TEST_SIZE, size=10)
 
     df_null = test_df.copy()
-    df_null.loc[df_null.index[a_null], "a"] = np.nan
-    df_null.loc[df_null.index[b_null], "b"] = np.nan
-    df_null.loc[df_null.index[c_null], "c"] = np.nan
-    df_null.loc[df_null.index[d_null], "d"] = np.nan
+    df_null.loc[df_null.index[a_null], "A"] = np.nan
+    df_null.loc[df_null.index[b_null], "B"] = np.nan
+    df_null.loc[df_null.index[c_null], "C"] = np.nan
+    df_null.loc[df_null.index[d_null], "D"] = np.nan
 
     df_null_float = test_df_float.copy()
-    df_null_float.loc[df_null_float.index[a_null], "a"] = np.nan
-    df_null_float.loc[df_null_float.index[b_null], "b"] = np.nan
-    df_null_float.loc[df_null_float.index[c_null], "c"] = np.nan
-    df_null_float.loc[df_null_float.index[d_null], "d"] = np.nan
+    df_null_float.loc[df_null_float.index[a_null], "A"] = np.nan
+    df_null_float.loc[df_null_float.index[b_null], "B"] = np.nan
+    df_null_float.loc[df_null_float.index[c_null], "C"] = np.nan
+    df_null_float.loc[df_null_float.index[d_null], "D"] = np.nan
 
-    for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]:
+    for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]:
         _test_kde_slogl_null_iter(variables, df, df_null)
         _test_kde_slogl_null_iter(variables, df_float, df_null_float)
 
-    cpd = pbn.KDE(["d", "a", "b", "c"])
+    cpd = pbn.KDE(["D", "A", "B", "C"])
     cpd.fit(df)
-    cpd2 = pbn.KDE(["a", "c", "d", "b"])
+    cpd2 = pbn.KDE(["A", "C", "D", "B"])
     cpd2.fit(df)
     assert np.all(
         np.isclose(cpd.slogl(df_null), cpd2.slogl(df_null))
     ), "Order of evidence changes slogl() result."
- cpd = pbn.KDE(["d", "a", "b", "c"]) + cpd = pbn.KDE(["D", "A", "B", "C"]) cpd.fit(df_float) - cpd2 = pbn.KDE(["a", "c", "d", "b"]) + cpd2 = pbn.KDE(["A", "C", "D", "B"]) cpd2.fit(df_float) assert np.all( np.isclose(cpd.slogl(df_null_float), cpd2.slogl(df_null_float)) diff --git a/tests/factors/continuous/LinearGaussianCPD_test.py b/tests/factors/continuous/LinearGaussianCPD_test.py index 4b74f056..eec2b982 100644 --- a/tests/factors/continuous/LinearGaussianCPD_test.py +++ b/tests/factors/continuous/LinearGaussianCPD_test.py @@ -1,11 +1,10 @@ import numpy as np import pandas as pd import pyarrow as pa +import pybnesian as pbn from scipy.stats import norm from util_test import generate_normal_data -import pybnesian as pbn - SIZE = 10000 df = generate_normal_data(SIZE) @@ -13,10 +12,10 @@ def test_lg_variable(): for variable, evidence in [ - ("a", []), - ("b", ["a"]), - ("c", ["a", "b"]), - ("d", ["a", "b", "c"]), + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), ]: cpd = pbn.LinearGaussianCPD(variable, evidence) assert cpd.variable() == variable @@ -24,10 +23,10 @@ def test_lg_variable(): def test_lg_evidence(): for variable, evidence in [ - ("a", []), - ("b", ["a"]), - ("c", ["a", "b"]), - ("d", ["a", "b", "c"]), + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), ]: cpd = pbn.LinearGaussianCPD(variable, evidence) assert cpd.evidence() == evidence @@ -44,16 +43,16 @@ def fit_numpy(_df, variable, evidence): def test_lg_data_type(): - cpd = pbn.LinearGaussianCPD("a", []) + cpd = pbn.LinearGaussianCPD("A", []) assert cpd.data_type() == pa.float64() def test_lg_fit(): for variable, evidence in [ - ("a", []), - ("b", ["a"]), - ("c", ["a", "b"]), - ("d", ["a", "b", "c"]), + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), ]: cpd = pbn.LinearGaussianCPD(variable, evidence) assert not cpd.fitted() @@ -74,16 +73,16 @@ def test_lg_fit_null(): d_null = np.random.randint(0, SIZE, size=100) df_null = df.copy() - df_null.loc[df_null.index[a_null], "a"] = np.nan - df_null.loc[df_null.index[b_null], "b"] = np.nan - df_null.loc[df_null.index[c_null], "c"] = np.nan - df_null.loc[df_null.index[d_null], "d"] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan for variable, evidence in [ - ("a", []), - ("b", ["a"]), - ("c", ["a", "b"]), - ("d", ["a", "b", "c"]), + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), ]: cpd = pbn.LinearGaussianCPD(variable, evidence) assert not cpd.fitted() @@ -138,10 +137,10 @@ def test_lg_logl(): test_df = generate_normal_data(5000) for variable, evidence in [ - ("a", []), - ("b", ["a"]), - ("c", ["a", "b"]), - ("d", ["a", "b", "c"]), + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), ]: cpd = pbn.LinearGaussianCPD(variable, evidence) cpd.fit(df) @@ -162,9 +161,9 @@ def test_lg_logl(): + ")" ) - cpd = pbn.LinearGaussianCPD("d", ["a", "b", "c"]) + cpd = pbn.LinearGaussianCPD("D", ["A", "B", "C"]) cpd.fit(df) - cpd2 = pbn.LinearGaussianCPD("d", ["c", "a", "b"]) + cpd2 = pbn.LinearGaussianCPD("D", ["C", "A", "B"]) cpd2.fit(df) assert np.all( @@ -182,16 +181,16 @@ def test_lg_logl_null(): d_null = np.random.randint(0, 5000, size=100) df_null = test_df.copy() - df_null.loc[df_null.index[a_null], "a"] = np.nan - df_null.loc[df_null.index[b_null], "b"] = np.nan - df_null.loc[df_null.index[c_null], "c"] = 
np.nan - df_null.loc[df_null.index[d_null], "d"] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan for variable, evidence in [ - ("a", []), - ("b", ["a"]), - ("c", ["a", "b"]), - ("d", ["a", "b", "c"]), + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), ]: cpd = pbn.LinearGaussianCPD(variable, evidence) cpd.fit(df) @@ -213,9 +212,9 @@ def test_lg_logl_null(): + ") with null values." ) - cpd = pbn.LinearGaussianCPD("d", ["a", "b", "c"]) + cpd = pbn.LinearGaussianCPD("D", ["A", "B", "C"]) cpd.fit(df) - cpd2 = pbn.LinearGaussianCPD("d", ["c", "a", "b"]) + cpd2 = pbn.LinearGaussianCPD("D", ["C", "A", "B"]) cpd2.fit(df) assert np.all( @@ -227,10 +226,10 @@ def test_lg_slogl(): test_df = generate_normal_data(5000) for variable, evidence in [ - ("a", []), - ("b", ["a"]), - ("c", ["a", "b"]), - ("d", ["a", "b", "c"]), + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), ]: cpd = pbn.LinearGaussianCPD(variable, evidence) cpd.fit(df) @@ -251,9 +250,9 @@ def test_lg_slogl(): + ")" ) - cpd = pbn.LinearGaussianCPD("d", ["a", "b", "c"]) + cpd = pbn.LinearGaussianCPD("D", ["A", "B", "C"]) cpd.fit(df) - cpd2 = pbn.LinearGaussianCPD("d", ["c", "a", "b"]) + cpd2 = pbn.LinearGaussianCPD("D", ["C", "A", "B"]) cpd2.fit(df) assert np.all( @@ -271,16 +270,16 @@ def test_lg_slogl_null(): d_null = np.random.randint(0, 5000, size=100) df_null = test_df.copy() - df_null.loc[df_null.index[a_null], "a"] = np.nan - df_null.loc[df_null.index[b_null], "b"] = np.nan - df_null.loc[df_null.index[c_null], "c"] = np.nan - df_null.loc[df_null.index[d_null], "d"] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan for variable, evidence in [ - ("a", []), - ("b", ["a"]), - ("c", ["a", "b"]), - ("d", ["a", "b", "c"]), + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), ]: cpd = pbn.LinearGaussianCPD(variable, evidence) cpd.fit(df) @@ -301,9 +300,9 @@ def test_lg_slogl_null(): + ") with null values." 
) - cpd = pbn.LinearGaussianCPD("d", ["a", "b", "c"]) + cpd = pbn.LinearGaussianCPD("D", ["A", "B", "C"]) cpd.fit(df) - cpd2 = pbn.LinearGaussianCPD("d", ["c", "a", "b"]) + cpd2 = pbn.LinearGaussianCPD("D", ["C", "A", "B"]) cpd2.fit(df) assert np.all( @@ -315,10 +314,10 @@ def test_lg_cdf(): test_df = generate_normal_data(5000) for variable, evidence in [ - ("a", []), - ("b", ["a"]), - ("c", ["a", "b"]), - ("d", ["a", "b", "c"]), + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), ]: cpd = pbn.LinearGaussianCPD(variable, evidence) cpd.fit(df) @@ -338,9 +337,9 @@ def test_lg_cdf(): + ")" ) - cpd = pbn.LinearGaussianCPD("d", ["a", "b", "c"]) + cpd = pbn.LinearGaussianCPD("D", ["A", "B", "C"]) cpd.fit(df) - cpd2 = pbn.LinearGaussianCPD("d", ["c", "a", "b"]) + cpd2 = pbn.LinearGaussianCPD("D", ["C", "A", "B"]) cpd2.fit(df) assert np.all( @@ -358,16 +357,16 @@ def test_lg_cdf_null(): d_null = np.random.randint(0, 5000, size=100) df_null = test_df.copy() - df_null.loc[df_null.index[a_null], "a"] = np.nan - df_null.loc[df_null.index[b_null], "b"] = np.nan - df_null.loc[df_null.index[c_null], "c"] = np.nan - df_null.loc[df_null.index[d_null], "d"] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan for variable, evidence in [ - ("a", []), - ("b", ["a"]), - ("c", ["a", "b"]), - ("d", ["a", "b", "c"]), + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), ]: cpd = pbn.LinearGaussianCPD(variable, evidence) cpd.fit(df) @@ -389,9 +388,9 @@ def test_lg_cdf_null(): + ") with null values." ) - cpd = pbn.LinearGaussianCPD("d", ["a", "b", "c"]) + cpd = pbn.LinearGaussianCPD("D", ["A", "B", "C"]) cpd.fit(df) - cpd2 = pbn.LinearGaussianCPD("d", ["c", "a", "b"]) + cpd2 = pbn.LinearGaussianCPD("D", ["C", "A", "B"]) cpd2.fit(df) assert np.all( @@ -402,7 +401,7 @@ def test_lg_cdf_null(): def test_lg_sample(): SAMPLE_SIZE = 1000 - cpd = pbn.LinearGaussianCPD("a", []) + cpd = pbn.LinearGaussianCPD("A", []) cpd.fit(df) sampled = cpd.sample(SAMPLE_SIZE, None, 0) @@ -410,20 +409,20 @@ def test_lg_sample(): assert sampled.type == pa.float64() assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE - cpd = pbn.LinearGaussianCPD("b", ["a"]) + cpd = pbn.LinearGaussianCPD("B", ["A"]) cpd.fit(df) - sampling_df = pd.DataFrame({"a": np.full((SAMPLE_SIZE,), 3.0)}) + sampling_df = pd.DataFrame({"A": np.full((SAMPLE_SIZE,), 3.0)}) sampled = cpd.sample(SAMPLE_SIZE, sampling_df, 0) assert sampled.type == pa.float64() assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE - cpd = pbn.LinearGaussianCPD("c", ["a", "b"]) + cpd = pbn.LinearGaussianCPD("C", ["A", "B"]) cpd.fit(df) sampling_df = pd.DataFrame( - {"a": np.full((SAMPLE_SIZE,), 3.0), "b": np.full((SAMPLE_SIZE,), 7.45)} + {"A": np.full((SAMPLE_SIZE,), 3.0), "B": np.full((SAMPLE_SIZE,), 7.45)} ) sampled = cpd.sample(SAMPLE_SIZE, sampling_df, 0) diff --git a/tests/factors/continuous/ProductKDE_test.py b/tests/factors/continuous/ProductKDE_test.py index 0920a5ad..847a8891 100644 --- a/tests/factors/continuous/ProductKDE_test.py +++ b/tests/factors/continuous/ProductKDE_test.py @@ -1,19 +1,18 @@ import numpy as np import pyarrow as pa +import pybnesian as pbn import pytest +from pybnesian import BandwidthSelector from scipy.stats import gaussian_kde from util_test import generate_normal_data -import pybnesian as pbn -from pybnesian import 
BandwidthSelector - SIZE = 500 df = generate_normal_data(SIZE, seed=0) df_float = df.astype("float32") def test_check_type(): - cpd = pbn.ProductKDE(["a"]) + cpd = pbn.ProductKDE(["A"]) cpd.fit(df) with pytest.raises(ValueError) as ex: cpd.logl(df_float) @@ -32,7 +31,7 @@ def test_check_type(): def test_productkde_variables(): - for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]: + for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]: cpd = pbn.ProductKDE(variables) assert cpd.variables() == variables @@ -63,7 +62,7 @@ def py_scott_bandwidth(df, variables): def test_productkde_bandwidth(): # for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: - for variables in [["c", "a", "b"], ["d", "a", "b", "c"]]: + for variables in [["C", "A", "B"], ["D", "A", "B", "C"]]: for instances in [50, 150, 500]: cpd = pbn.ProductKDE(variables) cpd.fit(df.iloc[:instances]) @@ -95,7 +94,7 @@ def test_productkde_bandwidth(): ) ), "Wrong bandwidth computed with Scott's rule." - cpd = pbn.ProductKDE(["a"]) + cpd = pbn.ProductKDE(["A"]) cpd.fit(df) cpd.bandwidth = [1] assert cpd.bandwidth == np.asarray([1]), "Could not change bandwidth." @@ -114,14 +113,14 @@ def diag_bandwidth(self, df, variables): def test_productkde_new_bandwidth(): - kde = pbn.ProductKDE(["a"], UnitaryBandwidth()) + kde = pbn.ProductKDE(["A"], UnitaryBandwidth()) kde.fit(df) assert kde.bandwidth == np.ones((1,)) kde.fit(df_float) assert kde.bandwidth == np.ones((1,)) - kde = pbn.ProductKDE(["a", "b", "c", "d"], UnitaryBandwidth()) + kde = pbn.ProductKDE(["A", "B", "C", "D"], UnitaryBandwidth()) kde.fit(df) assert np.all(kde.bandwidth == np.ones((4,))) @@ -130,7 +129,7 @@ def test_productkde_new_bandwidth(): def test_productkde_data_type(): - k = pbn.ProductKDE(["a"]) + k = pbn.ProductKDE(["A"]) with pytest.raises(ValueError) as ex: k.data_type() @@ -168,7 +167,7 @@ def _test_productkde_fit_iter(variables, _df, instances): ) ), "Wrong bandwidth." 
- for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]: + for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]: for instances in [50, 150, 500]: _test_productkde_fit_iter(variables, df, instances) _test_productkde_fit_iter(variables, df_float, instances) @@ -218,18 +217,18 @@ def _test_productkde_fit_null_iter(variables, _df, instances): d_null = np.random.randint(0, SIZE, size=100) df_null = df.copy() - df_null.loc[df_null.index[a_null], "a"] = np.nan - df_null.loc[df_null.index[b_null], "b"] = np.nan - df_null.loc[df_null.index[c_null], "c"] = np.nan - df_null.loc[df_null.index[d_null], "d"] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan df_null_float = df_float.copy() - df_null_float.loc[df_null_float.index[a_null], "a"] = np.nan - df_null_float.loc[df_null_float.index[b_null], "b"] = np.nan - df_null_float.loc[df_null_float.index[c_null], "c"] = np.nan - df_null_float.loc[df_null_float.index[d_null], "d"] = np.nan + df_null_float.loc[df_null_float.index[a_null], "A"] = np.nan + df_null_float.loc[df_null_float.index[b_null], "B"] = np.nan + df_null_float.loc[df_null_float.index[c_null], "C"] = np.nan + df_null_float.loc[df_null_float.index[d_null], "D"] = np.nan - for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]: + for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]: for instances in [50, 150, 500]: _test_productkde_fit_null_iter(variables, df_null, instances) _test_productkde_fit_null_iter(variables, df_null_float, instances) @@ -294,21 +293,21 @@ def _test_productkde_logl_iter(variables, _df, _test_df): test_df = generate_normal_data(50, seed=1) test_df_float = test_df.astype("float32") - for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]: + for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]: _test_productkde_logl_iter(variables, df, test_df) _test_productkde_logl_iter(variables, df_float, test_df_float) - cpd = pbn.ProductKDE(["d", "a", "b", "c"]) + cpd = pbn.ProductKDE(["D", "A", "B", "C"]) cpd.fit(df) - cpd2 = pbn.ProductKDE(["a", "c", "d", "b"]) + cpd2 = pbn.ProductKDE(["A", "C", "D", "B"]) cpd2.fit(df) assert np.all( np.isclose(cpd.logl(test_df), cpd2.logl(test_df)) ), "Order of evidence changes logl() result." 
- cpd = pbn.ProductKDE(["d", "a", "b", "c"]) + cpd = pbn.ProductKDE(["D", "A", "B", "C"]) cpd.fit(df_float) - cpd2 = pbn.ProductKDE(["a", "c", "d", "b"]) + cpd2 = pbn.ProductKDE(["A", "C", "D", "B"]) cpd2.fit(df_float) assert np.all( np.isclose(cpd.logl(test_df_float), cpd2.logl(test_df_float), atol=0.0005) @@ -367,32 +366,32 @@ def _test_productkde_logl_null_iter(variables, _df, _test_df): d_null = np.random.randint(0, TEST_SIZE, size=10) df_null = test_df.copy() - df_null.loc[df_null.index[a_null], "a"] = np.nan - df_null.loc[df_null.index[b_null], "b"] = np.nan - df_null.loc[df_null.index[c_null], "c"] = np.nan - df_null.loc[df_null.index[d_null], "d"] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan df_null_float = test_df_float.copy() - df_null_float.loc[df_null_float.index[a_null], "a"] = np.nan - df_null_float.loc[df_null_float.index[b_null], "b"] = np.nan - df_null_float.loc[df_null_float.index[c_null], "c"] = np.nan - df_null_float.loc[df_null_float.index[d_null], "d"] = np.nan + df_null_float.loc[df_null_float.index[a_null], "A"] = np.nan + df_null_float.loc[df_null_float.index[b_null], "B"] = np.nan + df_null_float.loc[df_null_float.index[c_null], "C"] = np.nan + df_null_float.loc[df_null_float.index[d_null], "D"] = np.nan - for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]: + for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]: _test_productkde_logl_null_iter(variables, df, df_null) _test_productkde_logl_null_iter(variables, df_float, df_null_float) - cpd = pbn.ProductKDE(["d", "a", "b", "c"]) + cpd = pbn.ProductKDE(["D", "A", "B", "C"]) cpd.fit(df) - cpd2 = pbn.ProductKDE(["a", "c", "d", "b"]) + cpd2 = pbn.ProductKDE(["A", "C", "D", "B"]) cpd2.fit(df) assert np.all( np.isclose(cpd.logl(df_null), cpd2.logl(df_null), equal_nan=True) ), "Order of evidence changes logl() result." - cpd = pbn.ProductKDE(["d", "a", "b", "c"]) + cpd = pbn.ProductKDE(["D", "A", "B", "C"]) cpd.fit(df_float) - cpd2 = pbn.ProductKDE(["a", "c", "d", "b"]) + cpd2 = pbn.ProductKDE(["A", "C", "D", "B"]) cpd2.fit(df_float) assert np.all( np.isclose( @@ -450,21 +449,21 @@ def _test_productkde_slogl_iter(variables, _df, _test_df): test_df = generate_normal_data(50, seed=1) test_df_float = test_df.astype("float32") - for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]: + for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]: _test_productkde_slogl_iter(variables, df, test_df) _test_productkde_slogl_iter(variables, df_float, test_df_float) - cpd = pbn.ProductKDE(["d", "a", "b", "c"]) + cpd = pbn.ProductKDE(["D", "A", "B", "C"]) cpd.fit(df) - cpd2 = pbn.ProductKDE(["a", "c", "d", "b"]) + cpd2 = pbn.ProductKDE(["A", "C", "D", "B"]) cpd2.fit(df) assert np.all( np.isclose(cpd.slogl(test_df), cpd2.slogl(test_df)) ), "Order of evidence changes slogl() result." 
- cpd = pbn.ProductKDE(["d", "a", "b", "c"]) + cpd = pbn.ProductKDE(["D", "A", "B", "C"]) cpd.fit(df_float) - cpd2 = pbn.ProductKDE(["a", "c", "d", "b"]) + cpd2 = pbn.ProductKDE(["A", "C", "D", "B"]) cpd2.fit(df_float) assert np.all( np.isclose(cpd.slogl(test_df_float), cpd2.slogl(test_df_float), atol=0.0005) @@ -526,32 +525,32 @@ def _test_productkde_slogl_null_iter(variables, _df, _test_df): d_null = np.random.randint(0, TEST_SIZE, size=10) df_null = test_df.copy() - df_null.loc[df_null.index[a_null], "a"] = np.nan - df_null.loc[df_null.index[b_null], "b"] = np.nan - df_null.loc[df_null.index[c_null], "c"] = np.nan - df_null.loc[df_null.index[d_null], "d"] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan df_null_float = test_df_float.copy() - df_null_float.loc[df_null_float.index[a_null], "a"] = np.nan - df_null_float.loc[df_null_float.index[b_null], "b"] = np.nan - df_null_float.loc[df_null_float.index[c_null], "c"] = np.nan - df_null_float.loc[df_null_float.index[d_null], "d"] = np.nan + df_null_float.loc[df_null_float.index[a_null], "A"] = np.nan + df_null_float.loc[df_null_float.index[b_null], "B"] = np.nan + df_null_float.loc[df_null_float.index[c_null], "C"] = np.nan + df_null_float.loc[df_null_float.index[d_null], "D"] = np.nan - for variables in [["a"], ["b", "a"], ["c", "a", "b"], ["d", "a", "b", "c"]]: + for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]: _test_productkde_slogl_null_iter(variables, df, df_null) _test_productkde_slogl_null_iter(variables, df_float, df_null_float) - cpd = pbn.ProductKDE(["d", "a", "b", "c"]) + cpd = pbn.ProductKDE(["D", "A", "B", "C"]) cpd.fit(df) - cpd2 = pbn.ProductKDE(["a", "c", "d", "b"]) + cpd2 = pbn.ProductKDE(["A", "C", "D", "B"]) cpd2.fit(df) assert np.all( np.isclose(cpd.slogl(df_null), cpd2.slogl(df_null)) ), "Order of evidence changes slogl() result." - cpd = pbn.ProductKDE(["d", "a", "b", "c"]) + cpd = pbn.ProductKDE(["D", "A", "B", "C"]) cpd.fit(df_float) - cpd2 = pbn.ProductKDE(["a", "c", "d", "b"]) + cpd2 = pbn.ProductKDE(["A", "C", "D", "B"]) cpd2.fit(df_float) assert np.all( np.isclose(cpd.slogl(df_null_float), cpd2.slogl(df_null_float), atol=0.0005) diff --git a/tests/factors/discrete/DiscreteFactor_test.py b/tests/factors/discrete/DiscreteFactor_test.py index f274d90e..798e3772 100644 --- a/tests/factors/discrete/DiscreteFactor_test.py +++ b/tests/factors/discrete/DiscreteFactor_test.py @@ -1,12 +1,11 @@ import numpy as np import pandas as pd import pyarrow as pa -import pytest -from util_test import generate_discrete_data_dependent - import pybnesian as pbn +import pytest +from util_test import generate_discrete_data -df = generate_discrete_data_dependent(10000) +df = generate_discrete_data(10000) def test_data_type(): @@ -15,7 +14,7 @@ def test_data_type(): a.data_type() assert "DiscreteFactor factor not fitted." 
in str(ex.value) - categories = np.asarray(["a1", "a2"]) + categories = np.asarray(["A1", "A2"]) a_values = pd.Categorical( categories[np.random.randint(len(categories), size=100)], categories=categories, @@ -25,7 +24,7 @@ def test_data_type(): a.fit(df) assert a.data_type() == pa.dictionary(pa.int8(), pa.string()) - categories = np.asarray(["a" + str(i) for i in range(1, 129)]) + categories = np.asarray(["A" + str(i) for i in range(1, 129)]) a_values = pd.Categorical( categories[np.random.randint(len(categories), size=100)], categories=categories, @@ -35,7 +34,7 @@ def test_data_type(): a.fit(df) assert a.data_type() == pa.dictionary(pa.int8(), pa.string()) - categories = np.asarray(["a" + str(i) for i in range(1, 130)]) + categories = np.asarray(["A" + str(i) for i in range(1, 130)]) a_values = pd.Categorical( categories[np.random.randint(len(categories), size=100)], categories=categories, diff --git a/tests/factors/factor_type_test.py b/tests/factors/factor_type_test.py index a52f678a..3a5905d4 100644 --- a/tests/factors/factor_type_test.py +++ b/tests/factors/factor_type_test.py @@ -1,31 +1,30 @@ -import pytest - import pybnesian as pbn +import pytest from pybnesian import Factor, FactorType def test_factor_type(): - lg1 = pbn.LinearGaussianCPD("a", []) - lg2 = pbn.LinearGaussianCPD("b", ["a"]) - lg3 = pbn.LinearGaussianCPD("c", ["b", "a"]) + lg1 = pbn.LinearGaussianCPD("A", []) + lg2 = pbn.LinearGaussianCPD("B", ["A"]) + lg3 = pbn.LinearGaussianCPD("C", ["B", "A"]) assert lg1.type() == pbn.LinearGaussianCPDType() assert lg1.type() == lg2.type() assert lg1.type() == lg3.type() assert lg2.type() == lg3.type() - c1 = pbn.CKDE("a", []) - c2 = pbn.CKDE("b", ["a"]) - c3 = pbn.CKDE("c", ["b", "a"]) + c1 = pbn.CKDE("A", []) + c2 = pbn.CKDE("B", ["A"]) + c3 = pbn.CKDE("C", ["B", "A"]) assert c1.type() == pbn.CKDEType() assert c1.type() == c2.type() assert c1.type() == c3.type() assert c2.type() == c3.type() - d1 = pbn.DiscreteFactor("a", []) - d2 = pbn.DiscreteFactor("b", ["a"]) - d3 = pbn.DiscreteFactor("c", ["b", "a"]) + d1 = pbn.DiscreteFactor("A", []) + d2 = pbn.DiscreteFactor("B", ["A"]) + d3 = pbn.DiscreteFactor("C", ["B", "A"]) assert d1.type() == pbn.DiscreteFactorType() assert d1.type() == d2.type() @@ -80,9 +79,9 @@ def __init__(self, variable, evidence): def type(self): return F_type() - f1 = F("a", []) - f2 = F("b", ["a"]) - f3 = F("c", ["a", "b"]) + f1 = F("A", []) + f2 = F("B", ["A"]) + f3 = F("C", ["A", "B"]) assert f1.type() == f2.type() assert f1.type() == f3.type() @@ -90,9 +89,9 @@ def type(self): assert str(f1.type()) == str(f2.type()) == str(f3.type()) == "FType" - dummy_network = pbn.GaussianNetwork(["a", "b", "c", "d"]) + dummy_network = pbn.GaussianNetwork(["A", "B", "C", "D"]) with pytest.raises(RuntimeError) as ex: - f1.type().new_factor(dummy_network, "d", ["a", "b", "c"]) + f1.type().new_factor(dummy_network, "D", ["A", "B", "C"]) assert 'Tried to call pure virtual function "FactorType::new_factor"' in str( ex.value ) @@ -114,9 +113,9 @@ def __init__(self, variable, evidence): def type(self): return G_type() - g1 = G("a", []) - g2 = G("b", ["a"]) - g3 = G("c", ["a", "b"]) + g1 = G("A", []) + g2 = G("B", ["A"]) + g3 = G("C", ["A", "B"]) assert g1.type() == g2.type() assert g1.type() == g3.type() @@ -126,8 +125,8 @@ def type(self): assert str(g1.type()) == str(g2.type()) == str(g3.type()) == "GType" - g4 = g1.type().new_factor(dummy_network, "d", ["a", "b", "c"]) + g4 = g1.type().new_factor(dummy_network, "D", ["A", "B", "C"]) assert g1.type() == g4.type() - assert 
g4.variable() == "d" - assert g4.evidence() == ["a", "b", "c"] + assert g4.variable() == "D" + assert g4.evidence() == ["A", "B", "C"] diff --git a/tests/learning/algorithms/hillclimbing_test.py b/tests/learning/algorithms/hillclimbing_test.py index 8e46928d..537a8f45 100644 --- a/tests/learning/algorithms/hillclimbing_test.py +++ b/tests/learning/algorithms/hillclimbing_test.py @@ -1,8 +1,7 @@ import numpy as np -from util_test import generate_normal_data - import pybnesian as pbn from pybnesian import BayesianNetwork, BayesianNetworkType +from util_test import generate_normal_data df = generate_normal_data(1000) # TODO: Add tests for normal data with dependencies @@ -15,11 +14,11 @@ def test_hc_estimate(): start = pbn.GaussianNetwork(column_names) # Check algorithm with BN with nodes removed. - column_names.insert(1, "e") - column_names.insert(3, "f") + column_names.insert(1, "E") + column_names.insert(3, "F") start_removed_nodes = pbn.GaussianNetwork(column_names) - start_removed_nodes.remove_node("e") - start_removed_nodes.remove_node("f") + start_removed_nodes.remove_node("E") + start_removed_nodes.remove_node("F") arc_set = pbn.ArcOperatorSet() @@ -85,12 +84,12 @@ def test_hc_conditional_estimate(): start = pbn.ConditionalGaussianNetwork(column_names[2:], column_names[:2]) nodes = column_names[2:] - nodes.insert(1, "e") + nodes.insert(1, "E") interface_nodes = column_names[:2] - interface_nodes.insert(1, "f") + interface_nodes.insert(1, "F") start_removed_nodes = pbn.ConditionalGaussianNetwork(nodes, interface_nodes) - start_removed_nodes.remove_node("e") - start_removed_nodes.remove_interface_node("f") + start_removed_nodes.remove_node("E") + start_removed_nodes.remove_interface_node("F") arc_set = pbn.ArcOperatorSet() hc = pbn.GreedyHillClimbing() @@ -138,11 +137,11 @@ def test_hc_estimate_validation(): column_names = list(df.columns.values) start = pbn.GaussianNetwork(column_names) - column_names.insert(1, "e") - column_names.insert(4, "f") + column_names.insert(1, "E") + column_names.insert(4, "F") start_removed_nodes = pbn.GaussianNetwork(column_names) - start_removed_nodes.remove_node("e") - start_removed_nodes.remove_node("f") + start_removed_nodes.remove_node("E") + start_removed_nodes.remove_node("F") vl = pbn.ValidatedLikelihood(df) arc_set = pbn.ArcOperatorSet() @@ -271,7 +270,7 @@ def default_node_type(self): return pbn.LinearGaussianCPDType() def can_have_arc(self, model, source, target): - return "a" in source + return "A" in source def new_bn(self, nodes): return NewBN(nodes) @@ -299,7 +298,7 @@ def __setstate_extra__(self, extra): def test_newbn_estimate_validation(): - start = NewBN(["a", "b", "c", "d"]) + start = NewBN(["A", "B", "C", "D"]) hc = pbn.GreedyHillClimbing() arc = pbn.ArcOperatorSet() bic = pbn.BIC(df) diff --git a/tests/learning/operators/operatorpool_test.py b/tests/learning/operators/operatorpool_test.py index 570e2b34..9b74e71f 100644 --- a/tests/learning/operators/operatorpool_test.py +++ b/tests/learning/operators/operatorpool_test.py @@ -1,8 +1,7 @@ +import pybnesian as pbn import pytest from util_test import generate_normal_data -import pybnesian as pbn - SIZE = 10000 df = generate_normal_data(SIZE) @@ -20,7 +19,7 @@ def test_create(): def test_find_max(): - spbn = pbn.SemiparametricBN(["a", "b", "c", "d"]) + spbn = pbn.SemiparametricBN(["A", "B", "C", "D"]) cv = pbn.CVLikelihood(df) arcs = pbn.ArcOperatorSet() node_type = pbn.ChangeNodeTypeSet() diff --git a/tests/learning/operators/operators_test.py b/tests/learning/operators/operators_test.py 
index bb6e8919..cc634e48 100644 --- a/tests/learning/operators/operators_test.py +++ b/tests/learning/operators/operators_test.py @@ -1,109 +1,108 @@ -import pytest - import pybnesian as pbn +import pytest def test_create(): - o = pbn.AddArc("a", "b", 1) - assert o.source() == "a" - assert o.target() == "b" + o = pbn.AddArc("A", "B", 1) + assert o.source() == "A" + assert o.target() == "B" assert o.delta() == 1 - o = pbn.RemoveArc("a", "b", 2) - assert o.source() == "a" - assert o.target() == "b" + o = pbn.RemoveArc("A", "B", 2) + assert o.source() == "A" + assert o.target() == "B" assert o.delta() == 2 - o = pbn.FlipArc("a", "b", 3) - assert o.source() == "a" - assert o.target() == "b" + o = pbn.FlipArc("A", "B", 3) + assert o.source() == "A" + assert o.target() == "B" assert o.delta() == 3 - o = pbn.ChangeNodeType("a", pbn.CKDEType(), 4) - assert o.node() == "a" + o = pbn.ChangeNodeType("A", pbn.CKDEType(), 4) + assert o.node() == "A" assert o.node_type() == pbn.CKDEType() assert o.delta() == 4 def test_apply(): - gbn = pbn.GaussianNetwork(["a", "b", "c", "d"]) + gbn = pbn.GaussianNetwork(["A", "B", "C", "D"]) assert gbn.num_arcs() == 0 - assert not gbn.has_arc("a", "b") + assert not gbn.has_arc("A", "B") - o = pbn.AddArc("a", "b", 1) + o = pbn.AddArc("A", "B", 1) o.apply(gbn) assert gbn.num_arcs() == 1 - assert gbn.has_arc("a", "b") + assert gbn.has_arc("A", "B") - o = pbn.FlipArc("a", "b", 1) + o = pbn.FlipArc("A", "B", 1) o.apply(gbn) assert gbn.num_arcs() == 1 - assert not gbn.has_arc("a", "b") - assert gbn.has_arc("b", "a") + assert not gbn.has_arc("A", "B") + assert gbn.has_arc("B", "A") - o = pbn.RemoveArc("b", "a", 1) + o = pbn.RemoveArc("B", "A", 1) o.apply(gbn) assert gbn.num_arcs() == 0 - assert not gbn.has_arc("b", "a") + assert not gbn.has_arc("B", "A") - o = pbn.ChangeNodeType("a", pbn.CKDEType(), 1) + o = pbn.ChangeNodeType("A", pbn.CKDEType(), 1) with pytest.raises(ValueError) as ex: o.apply(gbn) assert "Wrong factor type" in str(ex.value) - spbn = pbn.SemiparametricBN(["a", "b", "c", "d"]) + spbn = pbn.SemiparametricBN(["A", "B", "C", "D"]) assert spbn.num_arcs() == 0 - o = pbn.ChangeNodeType("a", pbn.CKDEType(), 1) - assert spbn.node_type("a") == pbn.UnknownFactorType() + o = pbn.ChangeNodeType("A", pbn.CKDEType(), 1) + assert spbn.node_type("A") == pbn.UnknownFactorType() o.apply(spbn) - assert spbn.node_type("a") == pbn.CKDEType() + assert spbn.node_type("A") == pbn.CKDEType() - assert not spbn.has_arc("a", "b") - o = pbn.AddArc("a", "b", 1) + assert not spbn.has_arc("A", "B") + o = pbn.AddArc("A", "B", 1) o.apply(spbn) assert spbn.num_arcs() == 1 - assert spbn.has_arc("a", "b") + assert spbn.has_arc("A", "B") - o = pbn.FlipArc("a", "b", 1) + o = pbn.FlipArc("A", "B", 1) o.apply(spbn) assert spbn.num_arcs() == 1 - assert not spbn.has_arc("a", "b") - assert spbn.has_arc("b", "a") + assert not spbn.has_arc("A", "B") + assert spbn.has_arc("B", "A") - o = pbn.RemoveArc("b", "a", 1) + o = pbn.RemoveArc("B", "A", 1) o.apply(spbn) assert spbn.num_arcs() == 0 - assert not spbn.has_arc("b", "a") + assert not spbn.has_arc("B", "A") def test_opposite(): - bn = pbn.SemiparametricBN(["a", "b"]) - o = pbn.AddArc("a", "b", 1) + bn = pbn.SemiparametricBN(["A", "B"]) + o = pbn.AddArc("A", "B", 1) oppo = o.opposite(bn) - assert oppo.source() == "a" - assert oppo.target() == "b" + assert oppo.source() == "A" + assert oppo.target() == "B" assert oppo.delta() == -1 assert type(oppo) == pbn.RemoveArc - o = pbn.RemoveArc("a", "b", 1) + o = pbn.RemoveArc("A", "B", 1) oppo = o.opposite(bn) - 
assert oppo.source() == "a" - assert oppo.target() == "b" + assert oppo.source() == "A" + assert oppo.target() == "B" assert oppo.delta() == -1 assert type(oppo) == pbn.AddArc - o = pbn.FlipArc("a", "b", 1) + o = pbn.FlipArc("A", "B", 1) oppo = o.opposite(bn) - assert oppo.source() == "b" - assert oppo.target() == "a" + assert oppo.source() == "B" + assert oppo.target() == "A" assert oppo.delta() == -1 assert type(oppo) == pbn.FlipArc - bn.set_node_type("a", pbn.LinearGaussianCPDType()) - o = pbn.ChangeNodeType("a", pbn.CKDEType(), 1) + bn.set_node_type("A", pbn.LinearGaussianCPDType()) + o = pbn.ChangeNodeType("A", pbn.CKDEType(), 1) oppo = o.opposite(bn) - assert oppo.node() == "a" + assert oppo.node() == "A" assert oppo.node_type() == pbn.LinearGaussianCPDType() assert oppo.delta() == -1 assert type(oppo) == pbn.ChangeNodeType diff --git a/tests/learning/operators/operatorset_test.py b/tests/learning/operators/operatorset_test.py index db581dd6..768961d7 100644 --- a/tests/learning/operators/operatorset_test.py +++ b/tests/learning/operators/operatorset_test.py @@ -1,15 +1,14 @@ import numpy as np +import pybnesian as pbn import pytest from util_test import generate_normal_data -import pybnesian as pbn - SIZE = 10000 df = generate_normal_data(SIZE) def test_create_change_node(): - gbn = pbn.GaussianNetwork(["a", "b", "c", "d"]) + gbn = pbn.GaussianNetwork(["A", "B", "C", "D"]) cv = pbn.CVLikelihood(df) @@ -21,24 +20,24 @@ def test_create_change_node(): def test_lists(): - gbn = pbn.GaussianNetwork(["a", "b", "c", "d"]) + gbn = pbn.GaussianNetwork(["A", "B", "C", "D"]) bic = pbn.BIC(df) arc_op = pbn.ArcOperatorSet() - arc_op.set_arc_blacklist([("b", "a")]) - arc_op.set_arc_whitelist([("b", "c")]) + arc_op.set_arc_blacklist([("B", "A")]) + arc_op.set_arc_whitelist([("B", "C")]) arc_op.set_max_indegree(3) - arc_op.set_type_whitelist([("a", pbn.LinearGaussianCPDType())]) + arc_op.set_type_whitelist([("A", pbn.LinearGaussianCPDType())]) arc_op.cache_scores(gbn, bic) - arc_op.set_arc_blacklist([("e", "a")]) + arc_op.set_arc_blacklist([("E", "A")]) with pytest.raises(ValueError) as ex: arc_op.cache_scores(gbn, bic) assert "not present in the graph" in str(ex.value) - arc_op.set_arc_whitelist([("e", "a")]) + arc_op.set_arc_whitelist([("E", "A")]) with pytest.raises(ValueError) as ex: arc_op.cache_scores(gbn, bic) @@ -46,7 +45,7 @@ def test_lists(): def test_check_max_score(): - gbn = pbn.GaussianNetwork(["c", "d"]) + gbn = pbn.GaussianNetwork(["C", "D"]) bic = pbn.BIC(df) arc_op = pbn.ArcOperatorSet() @@ -55,7 +54,7 @@ def test_check_max_score(): op = arc_op.find_max(gbn) assert np.isclose( - op.delta(), (bic.local_score(gbn, "d", ["c"]) - bic.local_score(gbn, "d")) + op.delta(), (bic.local_score(gbn, "D", ["C"]) - bic.local_score(gbn, "D")) ) # BIC is decomposable so the best operation is the arc in reverse direction. 
@@ -70,10 +69,10 @@ def test_check_max_score(): def test_nomax(): - gbn = pbn.GaussianNetwork(["a", "b"]) + gbn = pbn.GaussianNetwork(["A", "B"]) bic = pbn.BIC(df) - arc_op = pbn.ArcOperatorSet(whitelist=[("a", "b")]) + arc_op = pbn.ArcOperatorSet(whitelist=[("A", "B")]) arc_op.cache_scores(gbn, bic) op = arc_op.find_max(gbn) diff --git a/tests/learning/operators/operatorstabuset_test.py b/tests/learning/operators/operatorstabuset_test.py index be7bdfa0..b91daca1 100644 --- a/tests/learning/operators/operatorstabuset_test.py +++ b/tests/learning/operators/operatorstabuset_test.py @@ -6,18 +6,18 @@ def test_OperatorTabuSet(): assert tabu_set.empty() - assert not tabu_set.contains(pbn.AddArc("a", "b", 1)) - tabu_set.insert(pbn.AddArc("a", "b", 2)) + assert not tabu_set.contains(pbn.AddArc("A", "B", 1)) + tabu_set.insert(pbn.AddArc("A", "B", 2)) assert not tabu_set.empty() - assert tabu_set.contains(pbn.AddArc("a", "b", 3)) + assert tabu_set.contains(pbn.AddArc("A", "B", 3)) - assert not tabu_set.contains(pbn.RemoveArc("b", "c", 4)) - tabu_set.insert(pbn.RemoveArc("b", "c", 5)) - assert tabu_set.contains(pbn.RemoveArc("b", "c", 6)) + assert not tabu_set.contains(pbn.RemoveArc("B", "C", 4)) + tabu_set.insert(pbn.RemoveArc("B", "C", 5)) + assert tabu_set.contains(pbn.RemoveArc("B", "C", 6)) - assert not tabu_set.contains(pbn.FlipArc("c", "d", 7)) - tabu_set.insert(pbn.RemoveArc("c", "d", 8)) - assert tabu_set.contains(pbn.RemoveArc("c", "d", 9)) + assert not tabu_set.contains(pbn.FlipArc("C", "D", 7)) + tabu_set.insert(pbn.RemoveArc("C", "D", 8)) + assert tabu_set.contains(pbn.RemoveArc("C", "D", 9)) tabu_set.clear() assert tabu_set.empty() diff --git a/tests/learning/parameters/mle_test.py b/tests/learning/parameters/mle_test.py index 0ca091ef..74d25076 100644 --- a/tests/learning/parameters/mle_test.py +++ b/tests/learning/parameters/mle_test.py @@ -40,22 +40,22 @@ def test_mle_create(): def test_mle_lg(): mle = pbn.MLE(pbn.LinearGaussianCPDType()) - p = mle.estimate(df, "a", []) - np_beta, np_var = numpy_fit_mle_lg(df, "a", []) + p = mle.estimate(df, "A", []) + np_beta, np_var = numpy_fit_mle_lg(df, "A", []) assert np.all(np.isclose(p.beta, np_beta)) assert np.isclose(p.variance, np_var) - p = mle.estimate(df, "b", ["a"]) - np_beta, np_var = numpy_fit_mle_lg(df, "b", ["a"]) + p = mle.estimate(df, "B", ["A"]) + np_beta, np_var = numpy_fit_mle_lg(df, "B", ["A"]) assert np.all(np.isclose(p.beta, np_beta)) assert np.isclose(p.variance, np_var) - p = mle.estimate(df, "c", ["a", "b"]) - np_beta, np_var = numpy_fit_mle_lg(df, "c", ["a", "b"]) + p = mle.estimate(df, "C", ["A", "B"]) + np_beta, np_var = numpy_fit_mle_lg(df, "C", ["A", "B"]) assert np.all(np.isclose(p.beta, np_beta)) assert np.isclose(p.variance, np_var) - p = mle.estimate(df, "d", ["a", "b", "c"]) - np_beta, np_var = numpy_fit_mle_lg(df, "d", ["a", "b", "c"]) + p = mle.estimate(df, "D", ["A", "B", "C"]) + np_beta, np_var = numpy_fit_mle_lg(df, "D", ["A", "B", "C"]) assert np.all(np.isclose(p.beta, np_beta)) assert np.isclose(p.variance, np_var) diff --git a/tests/learning/scores/bic_test.py b/tests/learning/scores/bic_test.py index 5c103013..28f38927 100644 --- a/tests/learning/scores/bic_test.py +++ b/tests/learning/scores/bic_test.py @@ -1,9 +1,8 @@ import numpy as np +import pybnesian as pbn from scipy.stats import norm from util_test import generate_normal_data -import pybnesian as pbn - SIZE = 10000 df = generate_normal_data(SIZE) @@ -35,38 +34,38 @@ def numpy_local_score(data, variable, evidence): def test_bic_local_score(): 
gbn = pbn.GaussianNetwork( - ["a", "b", "c", "d"], - [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")], + ["A", "B", "C", "D"], + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")], ) bic = pbn.BIC(df) - assert np.isclose(bic.local_score(gbn, "a", []), numpy_local_score(df, "a", [])) + assert np.isclose(bic.local_score(gbn, "A", []), numpy_local_score(df, "A", [])) assert np.isclose( - bic.local_score(gbn, "b", ["a"]), numpy_local_score(df, "b", ["a"]) + bic.local_score(gbn, "B", ["A"]), numpy_local_score(df, "B", ["A"]) ) assert np.isclose( - bic.local_score(gbn, "c", ["a", "b"]), numpy_local_score(df, "c", ["a", "b"]) + bic.local_score(gbn, "C", ["A", "B"]), numpy_local_score(df, "C", ["A", "B"]) ) assert np.isclose( - bic.local_score(gbn, "d", ["a", "b", "c"]), - numpy_local_score(df, "d", ["a", "b", "c"]), + bic.local_score(gbn, "D", ["A", "B", "C"]), + numpy_local_score(df, "D", ["A", "B", "C"]), ) assert np.isclose( - bic.local_score(gbn, "d", ["a", "b", "c"]), - numpy_local_score(df, "d", ["b", "c", "a"]), + bic.local_score(gbn, "D", ["A", "B", "C"]), + numpy_local_score(df, "D", ["B", "C", "A"]), ) - assert bic.local_score(gbn, "a") == bic.local_score(gbn, "a", gbn.parents("a")) - assert bic.local_score(gbn, "b") == bic.local_score(gbn, "b", gbn.parents("b")) - assert bic.local_score(gbn, "c") == bic.local_score(gbn, "c", gbn.parents("c")) - assert bic.local_score(gbn, "d") == bic.local_score(gbn, "d", gbn.parents("d")) + assert bic.local_score(gbn, "A") == bic.local_score(gbn, "A", gbn.parents("A")) + assert bic.local_score(gbn, "B") == bic.local_score(gbn, "B", gbn.parents("B")) + assert bic.local_score(gbn, "C") == bic.local_score(gbn, "C", gbn.parents("C")) + assert bic.local_score(gbn, "D") == bic.local_score(gbn, "D", gbn.parents("D")) def test_bic_local_score_null(): gbn = pbn.GaussianNetwork( - ["a", "b", "c", "d"], - [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")], + ["A", "B", "C", "D"], + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")], ) np.random.seed(0) @@ -76,41 +75,41 @@ def test_bic_local_score_null(): d_null = np.random.randint(0, SIZE, size=100) df_null = df.copy() - df_null.loc[df_null.index[a_null], "a"] = np.nan - df_null.loc[df_null.index[b_null], "b"] = np.nan - df_null.loc[df_null.index[c_null], "c"] = np.nan - df_null.loc[df_null.index[d_null], "d"] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan bic = pbn.BIC(df_null) assert np.isclose( - bic.local_score(gbn, "a", []), numpy_local_score(df_null, "a", []) + bic.local_score(gbn, "A", []), numpy_local_score(df_null, "A", []) ) assert np.isclose( - bic.local_score(gbn, "b", ["a"]), numpy_local_score(df_null, "b", ["a"]) + bic.local_score(gbn, "B", ["A"]), numpy_local_score(df_null, "B", ["A"]) ) assert np.isclose( - bic.local_score(gbn, "c", ["a", "b"]), - numpy_local_score(df_null, "c", ["a", "b"]), + bic.local_score(gbn, "C", ["A", "B"]), + numpy_local_score(df_null, "C", ["A", "B"]), ) assert np.isclose( - bic.local_score(gbn, "d", ["a", "b", "c"]), - numpy_local_score(df_null, "d", ["a", "b", "c"]), + bic.local_score(gbn, "D", ["A", "B", "C"]), + numpy_local_score(df_null, "D", ["A", "B", "C"]), ) assert np.isclose( - bic.local_score(gbn, "d", ["a", "b", "c"]), - numpy_local_score(df_null, "d", ["b", "c", "a"]), + bic.local_score(gbn, "D", ["A", "B", 
"C"]), + numpy_local_score(df_null, "D", ["B", "C", "A"]), ) - assert bic.local_score(gbn, "a") == bic.local_score(gbn, "a", gbn.parents("a")) - assert bic.local_score(gbn, "b") == bic.local_score(gbn, "b", gbn.parents("b")) - assert bic.local_score(gbn, "c") == bic.local_score(gbn, "c", gbn.parents("c")) - assert bic.local_score(gbn, "d") == bic.local_score(gbn, "d", gbn.parents("d")) + assert bic.local_score(gbn, "A") == bic.local_score(gbn, "A", gbn.parents("A")) + assert bic.local_score(gbn, "B") == bic.local_score(gbn, "B", gbn.parents("B")) + assert bic.local_score(gbn, "C") == bic.local_score(gbn, "C", gbn.parents("C")) + assert bic.local_score(gbn, "D") == bic.local_score(gbn, "D", gbn.parents("D")) def test_bic_score(): gbn = pbn.GaussianNetwork( - [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")] + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")] ) bic = pbn.BIC(df) @@ -118,9 +117,9 @@ def test_bic_score(): assert np.isclose( bic.score(gbn), ( - bic.local_score(gbn, "a", []) - + bic.local_score(gbn, "b", ["a"]) - + bic.local_score(gbn, "c", ["a", "b"]) - + bic.local_score(gbn, "d", ["a", "b", "c"]) + bic.local_score(gbn, "A", []) + + bic.local_score(gbn, "B", ["A"]) + + bic.local_score(gbn, "C", ["A", "B"]) + + bic.local_score(gbn, "D", ["A", "B", "C"]) ), ) diff --git a/tests/learning/scores/cvlikelihood_test.py b/tests/learning/scores/cvlikelihood_test.py index e8ac3b69..99eebbd0 100644 --- a/tests/learning/scores/cvlikelihood_test.py +++ b/tests/learning/scores/cvlikelihood_test.py @@ -1,11 +1,10 @@ import numpy as np import pandas as pd +import pybnesian as pbn import pytest from scipy.stats import gaussian_kde, norm from util_test import generate_normal_data -import pybnesian as pbn - SIZE = 1000 df = generate_normal_data(SIZE) @@ -83,41 +82,41 @@ def test_cvl_create(): def test_cvl_local_score_gbn(): gbn = pbn.GaussianNetwork( - [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")] + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")] ) cvl = pbn.CVLikelihood(df, 10, seed) assert np.isclose( - cvl.local_score(gbn, "a", []), - numpy_local_score(pbn.LinearGaussianCPDType(), df, "a", []), + cvl.local_score(gbn, "A", []), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "A", []), ) assert np.isclose( - cvl.local_score(gbn, "b", ["a"]), - numpy_local_score(pbn.LinearGaussianCPDType(), df, "b", ["a"]), + cvl.local_score(gbn, "B", ["A"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "B", ["A"]), ) assert np.isclose( - cvl.local_score(gbn, "c", ["a", "b"]), - numpy_local_score(pbn.LinearGaussianCPDType(), df, "c", ["a", "b"]), + cvl.local_score(gbn, "C", ["A", "B"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "C", ["A", "B"]), ) assert np.isclose( - cvl.local_score(gbn, "d", ["a", "b", "c"]), - numpy_local_score(pbn.LinearGaussianCPDType(), df, "d", ["a", "b", "c"]), + cvl.local_score(gbn, "D", ["A", "B", "C"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "D", ["A", "B", "C"]), ) assert np.isclose( - cvl.local_score(gbn, "d", ["a", "b", "c"]), - cvl.local_score(gbn, "d", ["b", "c", "a"]), + cvl.local_score(gbn, "D", ["A", "B", "C"]), + cvl.local_score(gbn, "D", ["B", "C", "A"]), ) - assert cvl.local_score(gbn, "a") == cvl.local_score(gbn, "a", gbn.parents("a")) - assert cvl.local_score(gbn, "b") == cvl.local_score(gbn, "b", gbn.parents("b")) - assert cvl.local_score(gbn, "c") == cvl.local_score(gbn, "c", gbn.parents("c")) - assert cvl.local_score(gbn, "d") == 
cvl.local_score(gbn, "d", gbn.parents("d")) + assert cvl.local_score(gbn, "A") == cvl.local_score(gbn, "A", gbn.parents("A")) + assert cvl.local_score(gbn, "B") == cvl.local_score(gbn, "B", gbn.parents("B")) + assert cvl.local_score(gbn, "C") == cvl.local_score(gbn, "C", gbn.parents("C")) + assert cvl.local_score(gbn, "D") == cvl.local_score(gbn, "D", gbn.parents("D")) def test_cvl_local_score_gbn_null(): gbn = pbn.GaussianNetwork( - [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")] + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")] ) np.random.seed(0) @@ -127,99 +126,99 @@ def test_cvl_local_score_gbn_null(): d_null = np.random.randint(0, SIZE, size=100) df_null = df.copy() - df_null.loc[df_null.index[a_null], "a"] = np.nan - df_null.loc[df_null.index[b_null], "b"] = np.nan - df_null.loc[df_null.index[c_null], "c"] = np.nan - df_null.loc[df_null.index[d_null], "d"] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan cvl = pbn.CVLikelihood(df_null, 10, seed) assert np.isclose( - cvl.local_score(gbn, "a", []), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "a", []), + cvl.local_score(gbn, "A", []), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "A", []), ) assert np.isclose( - cvl.local_score(gbn, "b", ["a"]), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "b", ["a"]), + cvl.local_score(gbn, "B", ["A"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "B", ["A"]), ) assert np.isclose( - cvl.local_score(gbn, "c", ["a", "b"]), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "c", ["a", "b"]), + cvl.local_score(gbn, "C", ["A", "B"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "C", ["A", "B"]), ) assert np.isclose( - cvl.local_score(gbn, "d", ["a", "b", "c"]), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "d", ["a", "b", "c"]), + cvl.local_score(gbn, "D", ["A", "B", "C"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "D", ["A", "B", "C"]), ) assert np.isclose( - cvl.local_score(gbn, "d", ["a", "b", "c"]), - cvl.local_score(gbn, "d", ["b", "c", "a"]), + cvl.local_score(gbn, "D", ["A", "B", "C"]), + cvl.local_score(gbn, "D", ["B", "C", "A"]), ) - assert cvl.local_score(gbn, "a") == cvl.local_score(gbn, "a", gbn.parents("a")) - assert cvl.local_score(gbn, "b") == cvl.local_score(gbn, "b", gbn.parents("b")) - assert cvl.local_score(gbn, "c") == cvl.local_score(gbn, "c", gbn.parents("c")) - assert cvl.local_score(gbn, "d") == cvl.local_score(gbn, "d", gbn.parents("d")) + assert cvl.local_score(gbn, "A") == cvl.local_score(gbn, "A", gbn.parents("A")) + assert cvl.local_score(gbn, "B") == cvl.local_score(gbn, "B", gbn.parents("B")) + assert cvl.local_score(gbn, "C") == cvl.local_score(gbn, "C", gbn.parents("C")) + assert cvl.local_score(gbn, "D") == cvl.local_score(gbn, "D", gbn.parents("D")) def test_cvl_local_score_spbn(): spbn = pbn.SemiparametricBN( - [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")], - [("a", pbn.CKDEType()), ("c", pbn.CKDEType())], + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")], + [("A", pbn.CKDEType()), ("C", pbn.CKDEType())], ) cvl = pbn.CVLikelihood(df, 10, seed) assert np.isclose( - cvl.local_score(spbn, "a", []), numpy_local_score(pbn.CKDEType(), df, "a", []) + cvl.local_score(spbn, "A", []), numpy_local_score(pbn.CKDEType(), df, 
"A", []) ) assert np.isclose( - cvl.local_score(spbn, "b", ["a"]), - numpy_local_score(pbn.LinearGaussianCPDType(), df, "b", ["a"]), + cvl.local_score(spbn, "B", ["A"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "B", ["A"]), ) assert np.isclose( - cvl.local_score(spbn, "c", ["a", "b"]), - numpy_local_score(pbn.CKDEType(), df, "c", ["a", "b"]), + cvl.local_score(spbn, "C", ["A", "B"]), + numpy_local_score(pbn.CKDEType(), df, "C", ["A", "B"]), ) assert np.isclose( - cvl.local_score(spbn, "d", ["a", "b", "c"]), - numpy_local_score(pbn.LinearGaussianCPDType(), df, "d", ["a", "b", "c"]), + cvl.local_score(spbn, "D", ["A", "B", "C"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "D", ["A", "B", "C"]), ) assert np.isclose( - cvl.local_score(spbn, "d", ["a", "b", "c"]), - numpy_local_score(pbn.LinearGaussianCPDType(), df, "d", ["b", "c", "a"]), + cvl.local_score(spbn, "D", ["A", "B", "C"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "D", ["B", "C", "A"]), ) - assert cvl.local_score(spbn, "a") == cvl.local_score(spbn, "a", spbn.parents("a")) - assert cvl.local_score(spbn, "b") == cvl.local_score(spbn, "b", spbn.parents("b")) - assert cvl.local_score(spbn, "c") == cvl.local_score(spbn, "c", spbn.parents("c")) - assert cvl.local_score(spbn, "d") == cvl.local_score(spbn, "d", spbn.parents("d")) + assert cvl.local_score(spbn, "A") == cvl.local_score(spbn, "A", spbn.parents("A")) + assert cvl.local_score(spbn, "B") == cvl.local_score(spbn, "B", spbn.parents("B")) + assert cvl.local_score(spbn, "C") == cvl.local_score(spbn, "C", spbn.parents("C")) + assert cvl.local_score(spbn, "D") == cvl.local_score(spbn, "D", spbn.parents("D")) assert np.isclose( - cvl.local_score_node_type(spbn, pbn.LinearGaussianCPDType(), "a", []), - numpy_local_score(pbn.LinearGaussianCPDType(), df, "a", []), + cvl.local_score_node_type(spbn, pbn.LinearGaussianCPDType(), "A", []), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "A", []), ) assert np.isclose( - cvl.local_score_node_type(spbn, pbn.CKDEType(), "b", ["a"]), - numpy_local_score(pbn.CKDEType(), df, "b", ["a"]), + cvl.local_score_node_type(spbn, pbn.CKDEType(), "B", ["A"]), + numpy_local_score(pbn.CKDEType(), df, "B", ["A"]), ) assert np.isclose( - cvl.local_score_node_type(spbn, pbn.LinearGaussianCPDType(), "c", ["a", "b"]), - numpy_local_score(pbn.LinearGaussianCPDType(), df, "c", ["a", "b"]), + cvl.local_score_node_type(spbn, pbn.LinearGaussianCPDType(), "C", ["A", "B"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "C", ["A", "B"]), ) assert np.isclose( - cvl.local_score_node_type(spbn, pbn.CKDEType(), "d", ["a", "b", "c"]), - numpy_local_score(pbn.CKDEType(), df, "d", ["a", "b", "c"]), + cvl.local_score_node_type(spbn, pbn.CKDEType(), "D", ["A", "B", "C"]), + numpy_local_score(pbn.CKDEType(), df, "D", ["A", "B", "C"]), ) assert np.isclose( - cvl.local_score_node_type(spbn, pbn.CKDEType(), "d", ["a", "b", "c"]), - numpy_local_score(pbn.CKDEType(), df, "d", ["b", "c", "a"]), + cvl.local_score_node_type(spbn, pbn.CKDEType(), "D", ["A", "B", "C"]), + numpy_local_score(pbn.CKDEType(), df, "D", ["B", "C", "A"]), ) def test_cvl_local_score_null_spbn(): spbn = pbn.SemiparametricBN( - [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")], - [("a", pbn.CKDEType()), ("c", pbn.CKDEType())], + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")], + [("A", pbn.CKDEType()), ("C", pbn.CKDEType())], ) np.random.seed(0) @@ -229,64 +228,64 @@ def test_cvl_local_score_null_spbn(): d_null = 
np.random.randint(0, SIZE, size=100) df_null = df.copy() - df_null.loc[df_null.index[a_null], "a"] = np.nan - df_null.loc[df_null.index[b_null], "b"] = np.nan - df_null.loc[df_null.index[c_null], "c"] = np.nan - df_null.loc[df_null.index[d_null], "d"] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan cvl = pbn.CVLikelihood(df_null, 10, seed) assert np.isclose( - cvl.local_score(spbn, "a", []), - numpy_local_score(pbn.CKDEType(), df_null, "a", []), + cvl.local_score(spbn, "A", []), + numpy_local_score(pbn.CKDEType(), df_null, "A", []), ) assert np.isclose( - cvl.local_score(spbn, "b", ["a"]), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "b", ["a"]), + cvl.local_score(spbn, "B", ["A"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "B", ["A"]), ) assert np.isclose( - cvl.local_score(spbn, "c", ["a", "b"]), - numpy_local_score(pbn.CKDEType(), df_null, "c", ["a", "b"]), + cvl.local_score(spbn, "C", ["A", "B"]), + numpy_local_score(pbn.CKDEType(), df_null, "C", ["A", "B"]), ) assert np.isclose( - cvl.local_score(spbn, "d", ["a", "b", "c"]), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "d", ["a", "b", "c"]), + cvl.local_score(spbn, "D", ["A", "B", "C"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "D", ["A", "B", "C"]), ) assert np.isclose( - cvl.local_score(spbn, "d", ["a", "b", "c"]), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "d", ["b", "c", "a"]), + cvl.local_score(spbn, "D", ["A", "B", "C"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "D", ["B", "C", "A"]), ) - assert cvl.local_score(spbn, "a") == cvl.local_score(spbn, "a", spbn.parents("a")) - assert cvl.local_score(spbn, "b") == cvl.local_score(spbn, "b", spbn.parents("b")) - assert cvl.local_score(spbn, "c") == cvl.local_score(spbn, "c", spbn.parents("c")) - assert cvl.local_score(spbn, "d") == cvl.local_score(spbn, "d", spbn.parents("d")) + assert cvl.local_score(spbn, "A") == cvl.local_score(spbn, "A", spbn.parents("A")) + assert cvl.local_score(spbn, "B") == cvl.local_score(spbn, "B", spbn.parents("B")) + assert cvl.local_score(spbn, "C") == cvl.local_score(spbn, "C", spbn.parents("C")) + assert cvl.local_score(spbn, "D") == cvl.local_score(spbn, "D", spbn.parents("D")) assert np.isclose( - cvl.local_score_node_type(spbn, pbn.LinearGaussianCPDType(), "a", []), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "a", []), + cvl.local_score_node_type(spbn, pbn.LinearGaussianCPDType(), "A", []), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "A", []), ) assert np.isclose( - cvl.local_score_node_type(spbn, pbn.CKDEType(), "b", ["a"]), - numpy_local_score(pbn.CKDEType(), df_null, "b", ["a"]), + cvl.local_score_node_type(spbn, pbn.CKDEType(), "B", ["A"]), + numpy_local_score(pbn.CKDEType(), df_null, "B", ["A"]), ) assert np.isclose( - cvl.local_score_node_type(spbn, pbn.LinearGaussianCPDType(), "c", ["a", "b"]), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "c", ["a", "b"]), + cvl.local_score_node_type(spbn, pbn.LinearGaussianCPDType(), "C", ["A", "B"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "C", ["A", "B"]), ) assert np.isclose( - cvl.local_score_node_type(spbn, pbn.CKDEType(), "d", ["a", "b", "c"]), - numpy_local_score(pbn.CKDEType(), df_null, "d", ["a", "b", "c"]), + cvl.local_score_node_type(spbn, pbn.CKDEType(), "D", ["A", "B", "C"]), + 
numpy_local_score(pbn.CKDEType(), df_null, "D", ["A", "B", "C"]), ) assert np.isclose( - cvl.local_score_node_type(spbn, pbn.CKDEType(), "d", ["a", "b", "c"]), - numpy_local_score(pbn.CKDEType(), df_null, "d", ["b", "c", "a"]), + cvl.local_score_node_type(spbn, pbn.CKDEType(), "D", ["A", "B", "C"]), + numpy_local_score(pbn.CKDEType(), df_null, "D", ["B", "C", "A"]), ) def test_cvl_score(): gbn = pbn.GaussianNetwork( - [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")] + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")] ) cv = pbn.CVLikelihood(df, 10, 0) @@ -294,24 +293,24 @@ def test_cvl_score(): assert np.isclose( cv.score(gbn), ( - cv.local_score(gbn, "a", []) - + cv.local_score(gbn, "b", ["a"]) - + cv.local_score(gbn, "c", ["a", "b"]) - + cv.local_score(gbn, "d", ["a", "b", "c"]) + cv.local_score(gbn, "A", []) + + cv.local_score(gbn, "B", ["A"]) + + cv.local_score(gbn, "C", ["A", "B"]) + + cv.local_score(gbn, "D", ["A", "B", "C"]) ), ) spbn = pbn.SemiparametricBN( - [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")], - [("a", pbn.CKDEType()), ("c", pbn.CKDEType())], + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")], + [("A", pbn.CKDEType()), ("C", pbn.CKDEType())], ) assert np.isclose( cv.score(spbn), ( - cv.local_score(spbn, "a") - + cv.local_score(spbn, "b") - + cv.local_score(spbn, "c") - + cv.local_score(spbn, "d") + cv.local_score(spbn, "A") + + cv.local_score(spbn, "B") + + cv.local_score(spbn, "C") + + cv.local_score(spbn, "D") ), ) diff --git a/tests/learning/scores/holdoutlikelihood_test.py b/tests/learning/scores/holdoutlikelihood_test.py index 6f064f17..dc2ae24f 100644 --- a/tests/learning/scores/holdoutlikelihood_test.py +++ b/tests/learning/scores/holdoutlikelihood_test.py @@ -1,11 +1,10 @@ import numpy as np import pandas as pd +import pybnesian as pbn import pytest from scipy.stats import gaussian_kde, norm from util_test import generate_normal_data -import pybnesian as pbn - SIZE = 1000 df = generate_normal_data(SIZE) seed = 0 @@ -84,65 +83,65 @@ def test_holdout_create(): def test_holdout_local_score_gbn(): gbn = pbn.GaussianNetwork( - [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")] + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")] ) hl = pbn.HoldoutLikelihood(df, 0.2, seed) assert np.isclose( - hl.local_score(gbn, "a", []), + hl.local_score(gbn, "A", []), numpy_local_score( pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), - "a", + "A", [], ), ) assert np.isclose( - hl.local_score(gbn, "b", ["a"]), + hl.local_score(gbn, "B", ["A"]), numpy_local_score( pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), - "b", - ["a"], + "B", + ["A"], ), ) assert np.isclose( - hl.local_score(gbn, "c", ["a", "b"]), + hl.local_score(gbn, "C", ["A", "B"]), numpy_local_score( pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), - "c", - ["a", "b"], + "C", + ["A", "B"], ), ) assert np.isclose( - hl.local_score(gbn, "d", ["a", "b", "c"]), + hl.local_score(gbn, "D", ["A", "B", "C"]), numpy_local_score( pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), - "d", - ["a", "b", "c"], + "D", + ["A", "B", "C"], ), ) assert np.isclose( - hl.local_score(gbn, "d", ["a", "b", "c"]), - hl.local_score(gbn, "d", ["b", "c", "a"]), + hl.local_score(gbn, "D", ["A", "B", "C"]), + hl.local_score(gbn, "D", ["B", "C", "A"]), ) - 
assert hl.local_score(gbn, "a") == hl.local_score(gbn, "a", gbn.parents("a")) - assert hl.local_score(gbn, "b") == hl.local_score(gbn, "b", gbn.parents("b")) - assert hl.local_score(gbn, "c") == hl.local_score(gbn, "c", gbn.parents("c")) - assert hl.local_score(gbn, "d") == hl.local_score(gbn, "d", gbn.parents("d")) + assert hl.local_score(gbn, "A") == hl.local_score(gbn, "A", gbn.parents("A")) + assert hl.local_score(gbn, "B") == hl.local_score(gbn, "B", gbn.parents("B")) + assert hl.local_score(gbn, "C") == hl.local_score(gbn, "C", gbn.parents("C")) + assert hl.local_score(gbn, "D") == hl.local_score(gbn, "D", gbn.parents("D")) def test_holdout_local_score_gbn_null(): gbn = pbn.GaussianNetwork( - [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")] + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")] ) np.random.seed(0) @@ -152,133 +151,133 @@ def test_holdout_local_score_gbn_null(): d_null = np.random.randint(0, SIZE, size=100) df_null = df.copy() - df_null.loc[df_null.index[a_null], "a"] = np.nan - df_null.loc[df_null.index[b_null], "b"] = np.nan - df_null.loc[df_null.index[c_null], "c"] = np.nan - df_null.loc[df_null.index[d_null], "d"] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan hl = pbn.HoldoutLikelihood(df_null, 0.2, seed) assert np.isclose( - hl.local_score(gbn, "a", []), + hl.local_score(gbn, "A", []), numpy_local_score( pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), - "a", + "A", [], ), ) assert np.isclose( - hl.local_score(gbn, "b", ["a"]), + hl.local_score(gbn, "B", ["A"]), numpy_local_score( pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), - "b", - ["a"], + "B", + ["A"], ), ) assert np.isclose( - hl.local_score(gbn, "c", ["a", "b"]), + hl.local_score(gbn, "C", ["A", "B"]), numpy_local_score( pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), - "c", - ["a", "b"], + "C", + ["A", "B"], ), ) assert np.isclose( - hl.local_score(gbn, "d", ["a", "b", "c"]), + hl.local_score(gbn, "D", ["A", "B", "C"]), numpy_local_score( pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), - "d", - ["a", "b", "c"], + "D", + ["A", "B", "C"], ), ) assert np.isclose( - hl.local_score(gbn, "d", ["a", "b", "c"]), - hl.local_score(gbn, "d", ["b", "c", "a"]), + hl.local_score(gbn, "D", ["A", "B", "C"]), + hl.local_score(gbn, "D", ["B", "C", "A"]), ) - assert hl.local_score(gbn, "a") == hl.local_score(gbn, "a", gbn.parents("a")) - assert hl.local_score(gbn, "b") == hl.local_score(gbn, "b", gbn.parents("b")) - assert hl.local_score(gbn, "c") == hl.local_score(gbn, "c", gbn.parents("c")) - assert hl.local_score(gbn, "d") == hl.local_score(gbn, "d", gbn.parents("d")) + assert hl.local_score(gbn, "A") == hl.local_score(gbn, "A", gbn.parents("A")) + assert hl.local_score(gbn, "B") == hl.local_score(gbn, "B", gbn.parents("B")) + assert hl.local_score(gbn, "C") == hl.local_score(gbn, "C", gbn.parents("C")) + assert hl.local_score(gbn, "D") == hl.local_score(gbn, "D", gbn.parents("D")) def test_holdout_local_score_spbn(): spbn = pbn.SemiparametricBN( - [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")], - [("a", pbn.CKDEType()), ("c", pbn.CKDEType())], + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")], + 
[("A", pbn.CKDEType()), ("C", pbn.CKDEType())], ) hl = pbn.HoldoutLikelihood(df, 0.2, seed) assert np.isclose( - hl.local_score(spbn, "a", []), + hl.local_score(spbn, "A", []), numpy_local_score( pbn.CKDEType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), - "a", + "A", [], ), ) assert np.isclose( - hl.local_score(spbn, "b", ["a"]), + hl.local_score(spbn, "B", ["A"]), numpy_local_score( pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), - "b", - ["a"], + "B", + ["A"], ), ) assert np.isclose( - hl.local_score(spbn, "c", ["a", "b"]), + hl.local_score(spbn, "C", ["A", "B"]), numpy_local_score( pbn.CKDEType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), - "c", - ["a", "b"], + "C", + ["A", "B"], ), ) assert np.isclose( - hl.local_score(spbn, "d", ["a", "b", "c"]), + hl.local_score(spbn, "D", ["A", "B", "C"]), numpy_local_score( pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), - "d", - ["a", "b", "c"], + "D", + ["A", "B", "C"], ), ) assert np.isclose( - hl.local_score(spbn, "d", ["a", "b", "c"]), + hl.local_score(spbn, "D", ["A", "B", "C"]), numpy_local_score( pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), - "d", - ["b", "c", "a"], + "D", + ["B", "C", "A"], ), ) - assert hl.local_score(spbn, "a") == hl.local_score(spbn, "a", spbn.parents("a")) - assert hl.local_score(spbn, "b") == hl.local_score(spbn, "b", spbn.parents("b")) - assert hl.local_score(spbn, "c") == hl.local_score(spbn, "c", spbn.parents("c")) - assert hl.local_score(spbn, "d") == hl.local_score(spbn, "d", spbn.parents("d")) + assert hl.local_score(spbn, "A") == hl.local_score(spbn, "A", spbn.parents("A")) + assert hl.local_score(spbn, "B") == hl.local_score(spbn, "B", spbn.parents("B")) + assert hl.local_score(spbn, "C") == hl.local_score(spbn, "C", spbn.parents("C")) + assert hl.local_score(spbn, "D") == hl.local_score(spbn, "D", spbn.parents("D")) def test_holdout_local_score_null_spbn(): spbn = pbn.SemiparametricBN( - [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")], - [("a", pbn.CKDEType()), ("c", pbn.CKDEType())], + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")], + [("A", pbn.CKDEType()), ("C", pbn.CKDEType())], ) np.random.seed(0) @@ -288,73 +287,73 @@ def test_holdout_local_score_null_spbn(): d_null = np.random.randint(0, SIZE, size=100) df_null = df.copy() - df_null.loc[df_null.index[a_null], "a"] = np.nan - df_null.loc[df_null.index[b_null], "b"] = np.nan - df_null.loc[df_null.index[c_null], "c"] = np.nan - df_null.loc[df_null.index[d_null], "d"] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan hl = pbn.HoldoutLikelihood(df_null, 0.2, seed) assert np.isclose( - hl.local_score(spbn, "a", []), + hl.local_score(spbn, "A", []), numpy_local_score( pbn.CKDEType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), - "a", + "A", [], ), ) assert np.isclose( - hl.local_score(spbn, "b", ["a"]), + hl.local_score(spbn, "B", ["A"]), numpy_local_score( pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), - "b", - ["a"], + "B", + ["A"], ), ) assert np.isclose( - hl.local_score(spbn, "c", ["a", "b"]), + hl.local_score(spbn, "C", ["A", "B"]), numpy_local_score( pbn.CKDEType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), - 
"c", - ["a", "b"], + "C", + ["A", "B"], ), ) assert np.isclose( - hl.local_score(spbn, "d", ["a", "b", "c"]), + hl.local_score(spbn, "D", ["A", "B", "C"]), numpy_local_score( pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), - "d", - ["a", "b", "c"], + "D", + ["A", "B", "C"], ), ) assert np.isclose( - hl.local_score(spbn, "d", ["a", "b", "c"]), + hl.local_score(spbn, "D", ["A", "B", "C"]), numpy_local_score( pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), - "d", - ["b", "c", "a"], + "D", + ["B", "C", "A"], ), ) - assert hl.local_score(spbn, "a") == hl.local_score(spbn, "a", spbn.parents("a")) - assert hl.local_score(spbn, "b") == hl.local_score(spbn, "b", spbn.parents("b")) - assert hl.local_score(spbn, "c") == hl.local_score(spbn, "c", spbn.parents("c")) - assert hl.local_score(spbn, "d") == hl.local_score(spbn, "d", spbn.parents("d")) + assert hl.local_score(spbn, "A") == hl.local_score(spbn, "A", spbn.parents("A")) + assert hl.local_score(spbn, "B") == hl.local_score(spbn, "B", spbn.parents("B")) + assert hl.local_score(spbn, "C") == hl.local_score(spbn, "C", spbn.parents("C")) + assert hl.local_score(spbn, "D") == hl.local_score(spbn, "D", spbn.parents("D")) def test_holdout_score(): gbn = pbn.GaussianNetwork( - [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")] + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")] ) hl = pbn.HoldoutLikelihood(df, 0.2, 0) @@ -362,24 +361,24 @@ def test_holdout_score(): assert np.isclose( hl.score(gbn), ( - hl.local_score(gbn, "a", []) - + hl.local_score(gbn, "b", ["a"]) - + hl.local_score(gbn, "c", ["a", "b"]) - + hl.local_score(gbn, "d", ["a", "b", "c"]) + hl.local_score(gbn, "A", []) + + hl.local_score(gbn, "B", ["A"]) + + hl.local_score(gbn, "C", ["A", "B"]) + + hl.local_score(gbn, "D", ["A", "B", "C"]) ), ) spbn = pbn.SemiparametricBN( - [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")], - [("a", pbn.CKDEType()), ("c", pbn.CKDEType())], + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")], + [("A", pbn.CKDEType()), ("C", pbn.CKDEType())], ) assert np.isclose( hl.score(spbn), ( - hl.local_score(spbn, "a") - + hl.local_score(spbn, "b") - + hl.local_score(spbn, "c") - + hl.local_score(spbn, "d") + hl.local_score(spbn, "A") + + hl.local_score(spbn, "B") + + hl.local_score(spbn, "C") + + hl.local_score(spbn, "D") ), ) diff --git a/tests/models/BayesianNetwork_test.py b/tests/models/BayesianNetwork_test.py index 68767db3..b28d187a 100644 --- a/tests/models/BayesianNetwork_test.py +++ b/tests/models/BayesianNetwork_test.py @@ -1,62 +1,61 @@ import numpy as np -import pytest -from util_test import generate_normal_data - import pybnesian as pbn +import pytest from pybnesian import BayesianNetwork, GaussianNetwork +from util_test import generate_normal_data df = generate_normal_data(10000) def test_create_bn(): - gbn = GaussianNetwork(["a", "b", "c", "d"]) + gbn = GaussianNetwork(["A", "B", "C", "D"]) assert gbn.num_nodes() == 4 assert gbn.num_arcs() == 0 - assert gbn.nodes() == ["a", "b", "c", "d"] + assert gbn.nodes() == ["A", "B", "C", "D"] - gbn = GaussianNetwork(["a", "b", "c", "d"], [("a", "c")]) + gbn = GaussianNetwork(["A", "B", "C", "D"], [("A", "C")]) assert gbn.num_nodes() == 4 assert gbn.num_arcs() == 1 - assert gbn.nodes() == ["a", "b", "c", "d"] + assert gbn.nodes() == ["A", "B", "C", "D"] - gbn = GaussianNetwork([("a", "c"), ("b", "d"), ("c", "d")]) + gbn = 
GaussianNetwork([("A", "C"), ("B", "D"), ("C", "D")]) assert gbn.num_nodes() == 4 assert gbn.num_arcs() == 3 - assert gbn.nodes() == ["a", "c", "b", "d"] + assert gbn.nodes() == ["A", "C", "B", "D"] with pytest.raises(TypeError) as ex: - gbn = GaussianNetwork(["a", "b", "c"], [("a", "c", "b")]) + gbn = GaussianNetwork(["A", "B", "C"], [("A", "C", "B")]) assert "incompatible constructor arguments" in str(ex.value) with pytest.raises(IndexError) as ex: - gbn = GaussianNetwork(["a", "b", "c"], [("a", "d")]) + gbn = GaussianNetwork(["A", "B", "C"], [("A", "D")]) assert "not present in the graph" in str(ex.value) with pytest.raises(ValueError) as ex: - gbn = GaussianNetwork([("a", "b"), ("b", "c"), ("c", "a")]) + gbn = GaussianNetwork([("A", "B"), ("B", "C"), ("C", "A")]) assert "must be a DAG" in str(ex.value) with pytest.raises(ValueError) as ex: gbn = GaussianNetwork( - ["a", "b", "c", "d"], [("a", "b"), ("b", "c"), ("c", "a")] + ["A", "B", "C", "D"], [("A", "B"), ("B", "C"), ("C", "A")] ) assert "must be a DAG" in str(ex.value) with pytest.raises(ValueError) as ex: gbn = BayesianNetwork( - pbn.GaussianNetworkType(), ["a", "b", "c", "d"], [], [("a", pbn.CKDEType())] + pbn.GaussianNetworkType(), ["A", "B", "C", "D"], [], [("A", pbn.CKDEType())] ) assert "Wrong factor type" in str(ex.value) def gbn_generator(): # Test different Networks created with different constructors. - gbn = GaussianNetwork(["a", "b", "c", "d"]) + gbn = GaussianNetwork(["A", "B", "C", "D"]) yield gbn - gbn = GaussianNetwork([("a", "c"), ("b", "d"), ("c", "d")]) + gbn = GaussianNetwork([("A", "C"), ("B", "D"), ("C", "D")]) yield gbn - gbn = GaussianNetwork(["a", "b", "c", "d"], [("a", "b"), ("b", "c")]) + gbn = GaussianNetwork(["A", "B", "C", "D"], [("A", "B"), ("B", "C")]) yield gbn @@ -67,171 +66,171 @@ def test_nodes_util(): nodes = gbn.nodes() indices = gbn.indices() - assert nodes[gbn.index("a")] == "a" - assert nodes[gbn.index("b")] == "b" - assert nodes[gbn.index("c")] == "c" - assert nodes[gbn.index("d")] == "d" + assert nodes[gbn.index("A")] == "A" + assert nodes[gbn.index("B")] == "B" + assert nodes[gbn.index("C")] == "C" + assert nodes[gbn.index("D")] == "D" assert indices[gbn.name(0)] == 0 assert indices[gbn.name(1)] == 1 assert indices[gbn.name(2)] == 2 assert indices[gbn.name(3)] == 3 - assert gbn.contains_node("a") - assert gbn.contains_node("b") - assert gbn.contains_node("c") - assert gbn.contains_node("d") - assert not gbn.contains_node("e") + assert gbn.contains_node("A") + assert gbn.contains_node("B") + assert gbn.contains_node("C") + assert gbn.contains_node("D") + assert not gbn.contains_node("E") def test_parent_children(): - gbn = GaussianNetwork(["a", "b", "c", "d"]) + gbn = GaussianNetwork(["A", "B", "C", "D"]) - assert gbn.num_parents("a") == 0 - assert gbn.num_parents("b") == 0 - assert gbn.num_parents("c") == 0 - assert gbn.num_parents("d") == 0 + assert gbn.num_parents("A") == 0 + assert gbn.num_parents("B") == 0 + assert gbn.num_parents("C") == 0 + assert gbn.num_parents("D") == 0 - assert gbn.parents("a") == [] - assert gbn.parents("b") == [] - assert gbn.parents("c") == [] - assert gbn.parents("d") == [] + assert gbn.parents("A") == [] + assert gbn.parents("B") == [] + assert gbn.parents("C") == [] + assert gbn.parents("D") == [] - assert gbn.num_children("a") == 0 - assert gbn.num_children("b") == 0 - assert gbn.num_children("c") == 0 - assert gbn.num_children("d") == 0 + assert gbn.num_children("A") == 0 + assert gbn.num_children("B") == 0 + assert gbn.num_children("C") == 0 + assert 
gbn.num_children("D") == 0 - gbn = GaussianNetwork([("a", "c"), ("b", "d"), ("c", "d")]) + gbn = GaussianNetwork([("A", "C"), ("B", "D"), ("C", "D")]) - assert gbn.num_parents("a") == 0 - assert gbn.num_parents("b") == 0 - assert gbn.num_parents("c") == 1 - assert gbn.num_parents("d") == 2 + assert gbn.num_parents("A") == 0 + assert gbn.num_parents("B") == 0 + assert gbn.num_parents("C") == 1 + assert gbn.num_parents("D") == 2 - assert gbn.parents("a") == [] - assert gbn.parents("b") == [] - assert gbn.parents("c") == ["a"] - assert set(gbn.parents("d")) == set(["b", "c"]) + assert gbn.parents("A") == [] + assert gbn.parents("B") == [] + assert gbn.parents("C") == ["A"] + assert set(gbn.parents("D")) == set(["B", "C"]) - assert gbn.num_children("a") == 1 - assert gbn.num_children("b") == 1 - assert gbn.num_children("c") == 1 - assert gbn.num_children("d") == 0 + assert gbn.num_children("A") == 1 + assert gbn.num_children("B") == 1 + assert gbn.num_children("C") == 1 + assert gbn.num_children("D") == 0 - gbn = GaussianNetwork(["a", "b", "c", "d"], [("a", "b"), ("b", "c")]) + gbn = GaussianNetwork(["A", "B", "C", "D"], [("A", "B"), ("B", "C")]) - assert gbn.num_parents("a") == 0 - assert gbn.num_parents("b") == 1 - assert gbn.num_parents("c") == 1 - assert gbn.num_parents("d") == 0 + assert gbn.num_parents("A") == 0 + assert gbn.num_parents("B") == 1 + assert gbn.num_parents("C") == 1 + assert gbn.num_parents("D") == 0 - assert gbn.parents("a") == [] - assert gbn.parents("b") == ["a"] - assert gbn.parents("c") == ["b"] - assert gbn.parents("d") == [] + assert gbn.parents("A") == [] + assert gbn.parents("B") == ["A"] + assert gbn.parents("C") == ["B"] + assert gbn.parents("D") == [] - assert gbn.num_children("a") == 1 - assert gbn.num_children("b") == 1 - assert gbn.num_children("c") == 0 - assert gbn.num_children("d") == 0 + assert gbn.num_children("A") == 1 + assert gbn.num_children("B") == 1 + assert gbn.num_children("C") == 0 + assert gbn.num_children("D") == 0 def test_arcs(): - gbn = GaussianNetwork(["a", "b", "c", "d"]) + gbn = GaussianNetwork(["A", "B", "C", "D"]) assert gbn.num_arcs() == 0 assert gbn.arcs() == [] - assert not gbn.has_arc("a", "b") + assert not gbn.has_arc("A", "B") - gbn.add_arc("a", "b") + gbn.add_arc("A", "B") assert gbn.num_arcs() == 1 - assert gbn.arcs() == [("a", "b")] - assert gbn.parents("b") == ["a"] - assert gbn.num_parents("b") == 1 - assert gbn.num_children("a") == 1 - assert gbn.has_arc("a", "b") + assert gbn.arcs() == [("A", "B")] + assert gbn.parents("B") == ["A"] + assert gbn.num_parents("B") == 1 + assert gbn.num_children("A") == 1 + assert gbn.has_arc("A", "B") - gbn.add_arc("b", "c") + gbn.add_arc("B", "C") assert gbn.num_arcs() == 2 - assert set(gbn.arcs()) == set([("a", "b"), ("b", "c")]) - assert gbn.parents("c") == ["b"] - assert gbn.num_parents("c") == 1 - assert gbn.num_children("b") == 1 - assert gbn.has_arc("b", "c") + assert set(gbn.arcs()) == set([("A", "B"), ("B", "C")]) + assert gbn.parents("C") == ["B"] + assert gbn.num_parents("C") == 1 + assert gbn.num_children("B") == 1 + assert gbn.has_arc("B", "C") - gbn.add_arc("d", "c") + gbn.add_arc("D", "C") assert gbn.num_arcs() == 3 - assert set(gbn.arcs()) == set([("a", "b"), ("b", "c"), ("d", "c")]) - assert set(gbn.parents("c")) == set(["b", "d"]) - assert gbn.num_parents("c") == 2 - assert gbn.num_children("d") == 1 - assert gbn.has_arc("d", "c") - - assert gbn.has_path("a", "c") - assert not gbn.has_path("a", "d") - assert gbn.has_path("b", "c") - assert gbn.has_path("d", "c") - - assert 
not gbn.can_add_arc("c", "a") + assert set(gbn.arcs()) == set([("A", "B"), ("B", "C"), ("D", "C")]) + assert set(gbn.parents("C")) == set(["B", "D"]) + assert gbn.num_parents("C") == 2 + assert gbn.num_children("D") == 1 + assert gbn.has_arc("D", "C") + + assert gbn.has_path("A", "C") + assert not gbn.has_path("A", "D") + assert gbn.has_path("B", "C") + assert gbn.has_path("D", "C") + + assert not gbn.can_add_arc("C", "A") # The arc already exists, so adding it again is still reported as allowed. - assert gbn.can_add_arc("b", "c") - assert gbn.can_add_arc("d", "a") + assert gbn.can_add_arc("B", "C") + assert gbn.can_add_arc("D", "A") - gbn.add_arc("b", "d") + gbn.add_arc("B", "D") assert gbn.num_arcs() == 4 - assert set(gbn.arcs()) == set([("a", "b"), ("b", "c"), ("d", "c"), ("b", "d")]) - assert gbn.parents("d") == ["b"] - assert gbn.num_parents("d") == 1 - assert gbn.num_children("b") == 2 - assert gbn.has_arc("b", "d") - - assert gbn.has_path("a", "d") - assert not gbn.can_add_arc("d", "a") - assert not gbn.can_flip_arc("b", "c") - assert gbn.can_flip_arc("a", "b") + assert set(gbn.arcs()) == set([("A", "B"), ("B", "C"), ("D", "C"), ("B", "D")]) + assert gbn.parents("D") == ["B"] + assert gbn.num_parents("D") == 1 + assert gbn.num_children("B") == 2 + assert gbn.has_arc("B", "D") + + assert gbn.has_path("A", "D") + assert not gbn.can_add_arc("D", "A") + assert not gbn.can_flip_arc("B", "C") + assert gbn.can_flip_arc("A", "B") # This arc does not exist, but it could be flipped if it did. - assert gbn.can_flip_arc("d", "a") + assert gbn.can_flip_arc("D", "A") # Adding the same arc twice leaves the network unchanged. - gbn.add_arc("b", "d") + gbn.add_arc("B", "D") assert gbn.num_arcs() == 4 - assert set(gbn.arcs()) == set([("a", "b"), ("b", "c"), ("d", "c"), ("b", "d")]) - assert gbn.parents("d") == ["b"] - assert gbn.num_parents("d") == 1 - assert gbn.num_children("b") == 2 - assert gbn.has_arc("b", "d") + assert set(gbn.arcs()) == set([("A", "B"), ("B", "C"), ("D", "C"), ("B", "D")]) + assert gbn.parents("D") == ["B"] + assert gbn.num_parents("D") == 1 + assert gbn.num_children("B") == 2 + assert gbn.has_arc("B", "D") - gbn.remove_arc("b", "c") + gbn.remove_arc("B", "C") assert gbn.num_arcs() == 3 - assert set(gbn.arcs()) == set([("a", "b"), ("d", "c"), ("b", "d")]) - assert gbn.parents("c") == ["d"] - assert gbn.num_parents("c") == 1 - assert gbn.num_children("b") == 1 - assert not gbn.has_arc("b", "c") - - assert gbn.can_add_arc("b", "c") - assert not gbn.can_add_arc("c", "b") - assert gbn.has_path("a", "c") - assert gbn.has_path("b", "c") - - gbn.remove_arc("d", "c") + assert set(gbn.arcs()) == set([("A", "B"), ("D", "C"), ("B", "D")]) + assert gbn.parents("C") == ["D"] + assert gbn.num_parents("C") == 1 + assert gbn.num_children("B") == 1 + assert not gbn.has_arc("B", "C") + + assert gbn.can_add_arc("B", "C") + assert not gbn.can_add_arc("C", "B") + assert gbn.has_path("A", "C") + assert gbn.has_path("B", "C") + + gbn.remove_arc("D", "C") assert gbn.num_arcs() == 2 - assert set(gbn.arcs()) == set([("a", "b"), ("b", "d")]) - assert gbn.parents("c") == [] - assert gbn.num_parents("c") == 0 - assert gbn.num_children("d") == 0 - assert not gbn.has_arc("d", "c") + assert set(gbn.arcs()) == set([("A", "B"), ("B", "D")]) + assert gbn.parents("C") == [] + assert gbn.num_parents("C") == 0 + assert gbn.num_children("D") == 0 + assert not gbn.has_arc("D", "C") + 
assert gbn.can_add_arc("B", "C") + assert gbn.can_add_arc("C", "B") + assert not gbn.has_path("A", "C") + assert not gbn.has_path("B", "C") def test_bn_fit(): gbn = GaussianNetwork( - [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")] + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")] ) with pytest.raises(ValueError) as ex: @@ -248,83 +247,83 @@ def test_bn_fit(): gbn.fit(df) - gbn.remove_arc("a", "b") + gbn.remove_arc("A", "B") - cpd_b = gbn.cpd("b") - assert cpd_b.evidence != gbn.parents("b") + cpd_b = gbn.cpd("B") + assert cpd_b.evidence() != gbn.parents("B") gbn.fit(df) - cpd_b = gbn.cpd("b") - assert cpd_b.evidence() == gbn.parents("b") + cpd_b = gbn.cpd("B") + assert cpd_b.evidence() == gbn.parents("B")
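
A minimal sketch of the stale-CPD behaviour that test_bn_fit checks, assuming only the pybnesian API already exercised in these tests (df_demo and gbn_demo are illustrative names): after remove_arc(), the stored CPD keeps its old evidence until fit() re-estimates it.

from pybnesian import GaussianNetwork
from util_test import generate_normal_data

df_demo = generate_normal_data(1000)
gbn_demo = GaussianNetwork(["A", "B", "C", "D"], [("A", "B")])
gbn_demo.fit(df_demo)
gbn_demo.remove_arc("A", "B")
# The stored CPD still conditions on the removed parent...
assert gbn_demo.cpd("B").evidence() == ["A"]
gbn_demo.fit(df_demo)
# ...until fit() re-estimates it against the current structure.
assert gbn_demo.cpd("B").evidence() == []

def test_add_cpds(): gbn = GaussianNetwork( - [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")] + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")] ) with pytest.raises(ValueError) as ex: - gbn.add_cpds([pbn.LinearGaussianCPD("e", [])]) + gbn.add_cpds([pbn.LinearGaussianCPD("E", [])]) assert "variable which is not present" in str(ex.value) with pytest.raises(ValueError) as ex: - gbn.add_cpds([pbn.LinearGaussianCPD("a", ["e"])]) + gbn.add_cpds([pbn.LinearGaussianCPD("A", ["E"])]) assert "Evidence variable" in str(ex.value) with pytest.raises(ValueError) as ex: - gbn.add_cpds([pbn.LinearGaussianCPD("a", ["b"])]) + gbn.add_cpds([pbn.LinearGaussianCPD("A", ["B"])]) assert "CPD do not have the model's parent set as evidence" in str(ex.value) with pytest.raises(ValueError) as ex: - gbn.add_cpds([pbn.LinearGaussianCPD("b", [])]) + gbn.add_cpds([pbn.LinearGaussianCPD("B", [])]) assert "CPD do not have the model's parent set as evidence" in str(ex.value) with pytest.raises(ValueError) as ex: - gbn.add_cpds([pbn.LinearGaussianCPD("b", ["c"])]) + gbn.add_cpds([pbn.LinearGaussianCPD("B", ["C"])]) assert "CPD do not have the model's parent set as evidence" in str(ex.value) - lg = pbn.LinearGaussianCPD("b", ["a"], [2.5, 1.65], 4) + lg = pbn.LinearGaussianCPD("B", ["A"], [2.5, 1.65], 4) assert lg.fitted() gbn.add_cpds([lg]) - cpd_b = gbn.cpd("b") - assert cpd_b.variable() == "b" - assert cpd_b.evidence() == ["a"] + cpd_b = gbn.cpd("B") + assert cpd_b.variable() == "B" + assert cpd_b.evidence() == ["A"] assert cpd_b.fitted() assert np.all(cpd_b.beta == np.asarray([2.5, 1.65])) assert cpd_b.variance == 4 with pytest.raises(ValueError) as ex: - gbn.cpd("a") + gbn.cpd("A") assert ( - 'CPD of variable "a" not added. Call add_cpds() or fit() to add the CPD.' + 'CPD of variable "A" not added. Call add_cpds() or fit() to add the CPD.' in str(ex.value) ) with pytest.raises(ValueError) as ex: - gbn.cpd("c") + gbn.cpd("C") assert ( - 'CPD of variable "c" not added. Call add_cpds() or fit() to add the CPD.' + 'CPD of variable "C" not added. Call add_cpds() or fit() to add the CPD.' in str(ex.value) ) with pytest.raises(ValueError) as ex: - gbn.cpd("d") + gbn.cpd("D") assert ( - 'CPD of variable "d" not added. Call add_cpds() or fit() to add the CPD.' + 'CPD of variable "D" not added. Call add_cpds() or fit() to add the CPD.'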
in str(ex.value) ) with pytest.raises(ValueError) as ex: - gbn.add_cpds([pbn.LinearGaussianCPD("e", [])]) + gbn.add_cpds([pbn.LinearGaussianCPD("E", [])]) assert "variable which is not present" in str(ex.value) def test_bn_logl(): gbn = GaussianNetwork( - [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")] + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")] ) gbn.fit(df) @@ -351,19 +350,19 @@ def test_bn_logl(): def test_bn_sample(): gbn = GaussianNetwork( - ["a", "c", "b", "d"], - [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")], + ["A", "C", "B", "D"], + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")], ) gbn.fit(df) sample = gbn.sample(1000, 0, False) # Not ordered, so topological sort. - assert sample.schema.names == ["a", "b", "c", "d"] + assert sample.schema.names == ["A", "B", "C", "D"] assert sample.num_rows == 1000 sample_ordered = gbn.sample(1000, 0, True) - assert sample_ordered.schema.names == ["a", "c", "b", "d"] + assert sample_ordered.schema.names == ["A", "C", "B", "D"] assert sample_ordered.num_rows == 1000 assert sample.column(0).equals(sample_ordered.column(0)) diff --git a/tests/models/BayesianNetwork_type_test.py b/tests/models/BayesianNetwork_type_test.py index 71ae83de..678651a2 100644 --- a/tests/models/BayesianNetwork_type_test.py +++ b/tests/models/BayesianNetwork_type_test.py @@ -1,5 +1,3 @@ -from util_test import generate_normal_data_indep - import pybnesian as pbn from pybnesian import ( BayesianNetwork, @@ -10,39 +8,40 @@ KDENetwork, SemiparametricBN, ) +from util_test import generate_normal_data_independent def test_bn_type(): - g1 = GaussianNetwork(["a", "b", "c", "d"]) - g2 = GaussianNetwork(["a", "b", "c", "d"]) - g3 = GaussianNetwork(["a", "b", "c", "d"]) + g1 = GaussianNetwork(["A", "B", "C", "D"]) + g2 = GaussianNetwork(["A", "B", "C", "D"]) + g3 = GaussianNetwork(["A", "B", "C", "D"]) assert g1.type() == pbn.GaussianNetworkType() assert g1.type() == g2.type() assert g1.type() == g3.type() assert g2.type() == g3.type() - s1 = SemiparametricBN(["a", "b", "c", "d"]) - s2 = SemiparametricBN(["a", "b", "c", "d"]) - s3 = SemiparametricBN(["a", "b", "c", "d"]) + s1 = SemiparametricBN(["A", "B", "C", "D"]) + s2 = SemiparametricBN(["A", "B", "C", "D"]) + s3 = SemiparametricBN(["A", "B", "C", "D"]) assert s1.type() == pbn.SemiparametricBNType() assert s1.type() == s2.type() assert s1.type() == s3.type() assert s2.type() == s3.type() - k1 = KDENetwork(["a", "b", "c", "d"]) - k2 = KDENetwork(["a", "b", "c", "d"]) - k3 = KDENetwork(["a", "b", "c", "d"]) + k1 = KDENetwork(["A", "B", "C", "D"]) + k2 = KDENetwork(["A", "B", "C", "D"]) + k3 = KDENetwork(["A", "B", "C", "D"]) assert k1.type() == pbn.KDENetworkType() assert k1.type() == k2.type() assert k1.type() == k3.type() assert k2.type() == k3.type() - d1 = DiscreteBN(["a", "b", "c", "d"]) - d2 = DiscreteBN(["a", "b", "c", "d"]) - d3 = DiscreteBN(["a", "b", "c", "d"]) + d1 = DiscreteBN(["A", "B", "C", "D"]) + d2 = DiscreteBN(["A", "B", "C", "D"]) + d3 = DiscreteBN(["A", "B", "C", "D"]) assert d1.type() == pbn.DiscreteBNType() assert d1.type() == d2.type() @@ -66,7 +65,7 @@ def is_homogeneous(self): return True def can_have_arc(self, model, source, target): - return source == "a" + return source == "A" a1 = MyGaussianNetworkType() a2 = MyGaussianNetworkType() @@ -90,12 +89,12 @@ def __init__(self): assert a1 != b1 - mybn = BayesianNetwork(a1, ["a", "b", "c", "d"]) + mybn = BayesianNetwork(a1, ["A", "B", "C", "D"]) - # This 
type omits the arcs that do not have "a" as source. - assert mybn.can_add_arc("a", "b") - assert not mybn.can_add_arc("b", "a") - assert not mybn.can_add_arc("c", "d") + # This type disallows arcs that do not have "A" as source. + assert mybn.can_add_arc("A", "B") + assert not mybn.can_add_arc("B", "A") + assert not mybn.can_add_arc("C", "D") class MyRestrictedGaussianNetworkType(BayesianNetworkType): @@ -109,7 +108,7 @@ def default_node_type(self): return pbn.LinearGaussianCPDType() def can_have_arc(self, model, source, target): - return source == "a" + return source == "A" def __str__(self): return "MyRestrictedGaussianNetworkType" @@ -138,53 +137,53 @@ def __init__(self, variables, interface, arcs=None): def test_new_specific_bn_type(): - sp1 = SpecificNetwork(["a", "b", "c", "d"]) - sp2 = SpecificNetwork(["a", "b", "c", "d"], [("a", "b")]) - sp3 = SpecificNetwork(["a", "b", "c", "d"]) + sp1 = SpecificNetwork(["A", "B", "C", "D"]) + sp2 = SpecificNetwork(["A", "B", "C", "D"], [("A", "B")]) + sp3 = SpecificNetwork(["A", "B", "C", "D"]) assert sp1.type() == sp2.type() assert sp1.type() == sp3.type() assert sp2.type() == sp3.type() - assert sp1.can_add_arc("a", "b") - assert not sp1.can_add_arc("b", "a") - assert not sp1.can_add_arc("c", "d") + assert sp1.can_add_arc("A", "B") + assert not sp1.can_add_arc("B", "A") + assert not sp1.can_add_arc("C", "D") assert sp1.num_arcs() == sp3.num_arcs() == 0 - assert sp2.arcs() == [("a", "b")] + assert sp2.arcs() == [("A", "B")] - df = generate_normal_data_indep(1000) + df = generate_normal_data_independent(1000) bic = pbn.BIC(df) - start = SpecificNetwork(["a", "b", "c", "d"]) + start = SpecificNetwork(["A", "B", "C", "D"]) hc = pbn.GreedyHillClimbing() estimated = hc.estimate(pbn.ArcOperatorSet(), bic, start) assert estimated.type() == start.type() - assert all([s == "a" for s, t in estimated.arcs()]) + assert all([s == "A" for s, t in estimated.arcs()]) # ####################### # Conditional BN # ####################### - csp1 = ConditionalSpecificNetwork(["a", "b"], ["c", "d"]) - csp2 = ConditionalSpecificNetwork(["a", "b"], ["c", "d"], [("a", "b")]) - csp3 = ConditionalSpecificNetwork(["a", "b"], ["c", "d"]) + csp1 = ConditionalSpecificNetwork(["A", "B"], ["C", "D"]) + csp2 = ConditionalSpecificNetwork(["A", "B"], ["C", "D"], [("A", "B")]) + csp3 = ConditionalSpecificNetwork(["A", "B"], ["C", "D"]) assert csp1.type() == csp2.type() assert csp1.type() == csp3.type() assert csp2.type() == csp3.type() - assert csp1.can_add_arc("a", "b") - assert not csp1.can_add_arc("b", "a") - assert not csp1.can_add_arc("c", "d") + assert csp1.can_add_arc("A", "B") + assert not csp1.can_add_arc("B", "A") + assert not csp1.can_add_arc("C", "D") assert csp1.num_arcs() == csp3.num_arcs() == 0 - assert csp2.arcs() == [("a", "b")] + assert csp2.arcs() == [("A", "B")] - cstart = ConditionalSpecificNetwork(["a", "c"], ["b", "d"]) + cstart = ConditionalSpecificNetwork(["A", "C"], ["B", "D"]) hc = pbn.GreedyHillClimbing() cestimated = hc.estimate(pbn.ArcOperatorSet(), bic, cstart) assert cestimated.type() == cstart.type() - assert all([s == "a" for s, t in cestimated.arcs()]) + assert all([s == "A" for s, t in cestimated.arcs()])
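
A condensed sketch of the arc-restriction hook exercised above: a custom BayesianNetworkType vetoes arcs in can_have_arc(), and can_add_arc() on any network of that type honours the veto. It assumes the same pybnesian subclassing API used by MyGaussianNetworkType and MyRestrictedGaussianNetworkType, including the explicit base-class __init__ call; OnlyFromA is an illustrative name.

import pybnesian as pbn

class OnlyFromA(pbn.BayesianNetworkType):
    def __init__(self):
        pbn.BayesianNetworkType.__init__(self)

    def is_homogeneous(self):
        return True

    def default_node_type(self):
        return pbn.LinearGaussianCPDType()

    def can_have_arc(self, model, source, target):
        # Veto every arc whose source is not "A".
        return source == "A"

bn = pbn.BayesianNetwork(OnlyFromA(), ["A", "B", "C"])
assert bn.can_add_arc("A", "B")
assert not bn.can_add_arc("B", "C")

diff --git a/tests/models/DynamicBayesianNetwork_test.py index 3bcb0e62..786103a5 100644 --- a/tests/models/DynamicBayesianNetwork_test.py +++ b/tests/models/DynamicBayesianNetwork_test.py @@ -2,26 +2,25 @@ import numpy as np import pandas as pd -import pytest -from scipy.stats import norm -from 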
util_test import generate_normal_data - import pybnesian as pbn +import pytest from pybnesian import ( ConditionalGaussianNetwork, DynamicGaussianNetwork, GaussianNetwork, ) +from scipy.stats import norm +from util_test import generate_normal_data df = generate_normal_data(1000) def test_create_dbn(): - variables = ["a", "b", "c", "d"] + variables = ["A", "B", "C", "D"] gbn = DynamicGaussianNetwork(variables, 2) assert gbn.markovian_order() == 2 - assert gbn.variables() == ["a", "b", "c", "d"] + assert gbn.variables() == ["A", "B", "C", "D"] assert gbn.num_variables() == 4 assert gbn.type() == pbn.GaussianNetworkType() @@ -37,7 +36,7 @@ def test_create_dbn(): gbn2 = DynamicGaussianNetwork(variables, 2, static_bn, transition_bn) assert gbn2.markovian_order() == 2 - assert gbn2.variables() == ["a", "b", "c", "d"] + assert gbn2.variables() == ["A", "B", "C", "D"] assert gbn2.num_variables() == 4 assert gbn2.type() == pbn.GaussianNetworkType() @@ -56,42 +55,42 @@ def test_create_dbn(): def test_variable_operations_dbn(): - variables = ["a", "b", "c", "d"] + variables = ["A", "B", "C", "D"] gbn = DynamicGaussianNetwork(variables, 2) assert gbn.markovian_order() == 2 - assert gbn.variables() == ["a", "b", "c", "d"] + assert gbn.variables() == ["A", "B", "C", "D"] assert gbn.num_variables() == 4 - assert gbn.contains_variable("a") - assert gbn.contains_variable("b") - assert gbn.contains_variable("c") - assert gbn.contains_variable("d") + assert gbn.contains_variable("A") + assert gbn.contains_variable("B") + assert gbn.contains_variable("C") + assert gbn.contains_variable("D") - gbn.add_variable("e") - assert set(gbn.variables()) == set(["a", "b", "c", "d", "e"]) + gbn.add_variable("E") + assert set(gbn.variables()) == set(["A", "B", "C", "D", "E"]) assert gbn.num_variables() == 5 assert set(gbn.static_bn().nodes()) == set( - [v + "_t_" + str(m) for v in variables + ["e"] for m in range(1, 3)] + [v + "_t_" + str(m) for v in variables + ["E"] for m in range(1, 3)] ) assert set(gbn.transition_bn().nodes()) == set( - [v + "_t_0" for v in variables + ["e"]] + [v + "_t_0" for v in variables + ["E"]] ) - gbn.remove_variable("b") - assert set(gbn.variables()) == set(["a", "c", "d", "e"]) + gbn.remove_variable("B") + assert set(gbn.variables()) == set(["A", "C", "D", "E"]) assert gbn.num_variables() == 4 assert set(gbn.static_bn().nodes()) == set( - [v + "_t_" + str(m) for v in ["a", "c", "d", "e"] for m in range(1, 3)] + [v + "_t_" + str(m) for v in ["A", "C", "D", "E"] for m in range(1, 3)] ) assert set(gbn.transition_bn().nodes()) == set( - [v + "_t_0" for v in ["a", "c", "d", "e"]] + [v + "_t_0" for v in ["A", "C", "D", "E"]] ) def test_fit_dbn(): - variables = ["a", "b", "c", "d"] + variables = ["A", "B", "C", "D"] gbn = DynamicGaussianNetwork(variables, 2) assert not gbn.fitted() assert not gbn.static_bn().fitted() @@ -171,33 +170,33 @@ def numpy_logl(dbn, test_data): def test_logl_dbn(): - variables = ["a", "b", "c", "d"] + variables = ["A", "B", "C", "D"] static_bn = GaussianNetwork( - ["a", "b", "c", "d"], [("a", "c"), ("b", "c"), ("c", "d")] + ["A", "B", "C", "D"], [("A", "C"), ("B", "C"), ("C", "D")] ) static_bn = GaussianNetwork( - ["a", "b", "c", "d"], [("a", "c"), ("b", "c"), ("c", "d")] + ["A", "B", "C", "D"], [("A", "C"), ("B", "C"), ("C", "D")] ) gbn = DynamicGaussianNetwork(variables, 2) static_bn = gbn.static_bn() - static_bn.add_arc("a_t_2", "c_t_2") - static_bn.add_arc("b_t_2", "c_t_2") - static_bn.add_arc("c_t_2", "d_t_2") - static_bn.add_arc("a_t_1", "c_t_1") - 
static_bn.add_arc("b_t_1", "c_t_1") - static_bn.add_arc("c_t_1", "d_t_1") + static_bn.add_arc("A_t_2", "C_t_2") + static_bn.add_arc("B_t_2", "C_t_2") + static_bn.add_arc("C_t_2", "D_t_2") + static_bn.add_arc("A_t_1", "C_t_1") + static_bn.add_arc("B_t_1", "C_t_1") + static_bn.add_arc("C_t_1", "D_t_1") transition_bn = gbn.transition_bn() - transition_bn.add_arc("a_t_2", "a_t_0") - transition_bn.add_arc("b_t_2", "b_t_0") - transition_bn.add_arc("c_t_2", "c_t_0") - transition_bn.add_arc("d_t_2", "d_t_0") - transition_bn.add_arc("a_t_1", "a_t_0") - transition_bn.add_arc("b_t_1", "b_t_0") - transition_bn.add_arc("c_t_1", "c_t_0") - transition_bn.add_arc("d_t_1", "d_t_0") + transition_bn.add_arc("A_t_2", "A_t_0") + transition_bn.add_arc("B_t_2", "B_t_0") + transition_bn.add_arc("C_t_2", "C_t_0") + transition_bn.add_arc("D_t_2", "D_t_0") + transition_bn.add_arc("A_t_1", "A_t_0") + transition_bn.add_arc("B_t_1", "B_t_0") + transition_bn.add_arc("C_t_1", "C_t_0") + transition_bn.add_arc("D_t_1", "D_t_0") gbn.fit(df) @@ -208,33 +207,33 @@ def test_logl_dbn(): def test_slogl_dbn(): - variables = ["a", "b", "c", "d"] + variables = ["A", "B", "C", "D"] static_bn = GaussianNetwork( - ["a", "b", "c", "d"], [("a", "c"), ("b", "c"), ("c", "d")] + ["A", "B", "C", "D"], [("A", "C"), ("B", "C"), ("C", "D")] ) static_bn = GaussianNetwork( - ["a", "b", "c", "d"], [("a", "c"), ("b", "c"), ("c", "d")] + ["A", "B", "C", "D"], [("A", "C"), ("B", "C"), ("C", "D")] ) gbn = DynamicGaussianNetwork(variables, 2) static_bn = gbn.static_bn() - static_bn.add_arc("a_t_2", "c_t_2") - static_bn.add_arc("b_t_2", "c_t_2") - static_bn.add_arc("c_t_2", "d_t_2") - static_bn.add_arc("a_t_1", "c_t_1") - static_bn.add_arc("b_t_1", "c_t_1") - static_bn.add_arc("c_t_1", "d_t_1") + static_bn.add_arc("A_t_2", "C_t_2") + static_bn.add_arc("B_t_2", "C_t_2") + static_bn.add_arc("C_t_2", "D_t_2") + static_bn.add_arc("A_t_1", "C_t_1") + static_bn.add_arc("B_t_1", "C_t_1") + static_bn.add_arc("C_t_1", "D_t_1") transition_bn = gbn.transition_bn() - transition_bn.add_arc("a_t_2", "a_t_0") - transition_bn.add_arc("b_t_2", "b_t_0") - transition_bn.add_arc("c_t_2", "c_t_0") - transition_bn.add_arc("d_t_2", "d_t_0") - transition_bn.add_arc("a_t_1", "a_t_0") - transition_bn.add_arc("b_t_1", "b_t_0") - transition_bn.add_arc("c_t_1", "c_t_0") - transition_bn.add_arc("d_t_1", "d_t_0") + transition_bn.add_arc("A_t_2", "A_t_0") + transition_bn.add_arc("B_t_2", "B_t_0") + transition_bn.add_arc("C_t_2", "C_t_0") + transition_bn.add_arc("D_t_2", "D_t_0") + transition_bn.add_arc("A_t_1", "A_t_0") + transition_bn.add_arc("B_t_1", "B_t_0") + transition_bn.add_arc("C_t_1", "C_t_0") + transition_bn.add_arc("D_t_1", "D_t_0") gbn.fit(df) test_df = generate_normal_data(100) diff --git a/tests/models/HeterogeneousBN_test.py b/tests/models/HeterogeneousBN_test.py index 0bd7127b..72ea865e 100644 --- a/tests/models/HeterogeneousBN_test.py +++ b/tests/models/HeterogeneousBN_test.py @@ -1,5 +1,4 @@ import pyarrow as pa - import pybnesian as pbn @@ -9,16 +8,16 @@ def test_type_equality(): # het_single = pbn.HeterogeneousBN( - [pbn.CKDEType(), pbn.LinearGaussianCPDType()], ["a", "b", "c", "d"] + [pbn.CKDEType(), pbn.LinearGaussianCPDType()], ["A", "B", "C", "D"] ) het2_single = pbn.HeterogeneousBN( - [pbn.CKDEType(), pbn.LinearGaussianCPDType()], ["a", "b", "c", "d"] + [pbn.CKDEType(), pbn.LinearGaussianCPDType()], ["A", "B", "C", "D"] ) assert het_single.type() == het2_single.type() het3_single = pbn.HeterogeneousBN( - [pbn.LinearGaussianCPDType(), pbn.CKDEType()], ["a", 
"b", "c", "d"] + [pbn.LinearGaussianCPDType(), pbn.CKDEType()], ["A", "B", "C", "D"] ) assert het_single.type() != het3_single.type() @@ -33,7 +32,7 @@ def test_type_equality(): pa.float32(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()], pa.dictionary(pa.int8(), pa.string()): [pbn.DiscreteFactorType()], }, - ["a", "b", "c", "d"], + ["A", "B", "C", "D"], ) het2_dt = pbn.HeterogeneousBN( @@ -42,7 +41,7 @@ def test_type_equality(): pa.float32(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()], pa.float64(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()], }, - ["a", "b", "c", "d"], + ["A", "B", "C", "D"], ) # The order of the set is not relevant @@ -54,7 +53,7 @@ def test_type_equality(): pa.float32(): [pbn.LinearGaussianCPDType(), pbn.CKDEType()], pa.float64(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()], }, - ["a", "b", "c", "d"], + ["A", "B", "C", "D"], ) # The order of the default FactorTypes is relevant @@ -64,11 +63,11 @@ def test_type_equality(): # Compare single vector and multi vector FactorTypes het_single = pbn.HeterogeneousBN( - [pbn.CKDEType(), pbn.LinearGaussianCPDType()], ["a", "b", "c", "d"] + [pbn.CKDEType(), pbn.LinearGaussianCPDType()], ["A", "B", "C", "D"] ) het_dt = pbn.HeterogeneousBN( {pa.float64(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()]}, - ["a", "b", "c", "d"], + ["A", "B", "C", "D"], ) assert het_single.type() != het_dt.type() diff --git a/tests/models/SemiparametricBN_test.py b/tests/models/SemiparametricBN_test.py index 683f976b..56c4d16b 100644 --- a/tests/models/SemiparametricBN_test.py +++ b/tests/models/SemiparametricBN_test.py @@ -1,146 +1,145 @@ import numpy as np -import pytest -from util_test import generate_normal_data - import pybnesian as pbn +import pytest from pybnesian import CKDE, LinearGaussianCPD, SemiparametricBN +from util_test import generate_normal_data df = generate_normal_data(10000) def test_create_spbn(): - spbn = SemiparametricBN(["a", "b", "c", "d"]) + spbn = SemiparametricBN(["A", "B", "C", "D"]) assert spbn.num_nodes() == 4 assert spbn.num_arcs() == 0 - assert spbn.nodes() == ["a", "b", "c", "d"] + assert spbn.nodes() == ["A", "B", "C", "D"] for n in spbn.nodes(): assert spbn.node_type(n) == pbn.UnknownFactorType() - spbn = SemiparametricBN(["a", "b", "c", "d"], [("a", "c")]) + spbn = SemiparametricBN(["A", "B", "C", "D"], [("A", "C")]) assert spbn.num_nodes() == 4 assert spbn.num_arcs() == 1 - assert spbn.nodes() == ["a", "b", "c", "d"] + assert spbn.nodes() == ["A", "B", "C", "D"] for n in spbn.nodes(): assert spbn.node_type(n) == pbn.UnknownFactorType() - spbn = SemiparametricBN([("a", "c"), ("b", "d"), ("c", "d")]) + spbn = SemiparametricBN([("A", "C"), ("B", "D"), ("C", "D")]) assert spbn.num_nodes() == 4 assert spbn.num_arcs() == 3 - assert spbn.nodes() == ["a", "c", "b", "d"] + assert spbn.nodes() == ["A", "C", "B", "D"] for n in spbn.nodes(): assert spbn.node_type(n) == pbn.UnknownFactorType() with pytest.raises(TypeError) as ex: - spbn = SemiparametricBN(["a", "b", "c"], [("a", "c", "b")]) + spbn = SemiparametricBN(["A", "B", "C"], [("A", "C", "B")]) assert "incompatible constructor arguments" in str(ex.value) with pytest.raises(IndexError) as ex: - spbn = SemiparametricBN(["a", "b", "c"], [("a", "d")]) + spbn = SemiparametricBN(["A", "B", "C"], [("A", "D")]) assert "not present in the graph" in str(ex.value) with pytest.raises(ValueError) as ex: - spbn = SemiparametricBN([("a", "b"), ("b", "c"), ("c", "a")]) + spbn = SemiparametricBN([("A", "B"), ("B", "C"), ("C", "A")]) assert "must be a DAG" in str(ex.value) with 
pytest.raises(ValueError) as ex: spbn = SemiparametricBN( - ["a", "b", "c", "d"], [("a", "b"), ("b", "c"), ("c", "a")] + ["A", "B", "C", "D"], [("A", "B"), ("B", "C"), ("C", "A")] ) assert "must be a DAG" in str(ex.value) expected_node_type = { - "a": pbn.CKDEType(), - "b": pbn.UnknownFactorType(), - "c": pbn.CKDEType(), - "d": pbn.UnknownFactorType(), + "A": pbn.CKDEType(), + "B": pbn.UnknownFactorType(), + "C": pbn.CKDEType(), + "D": pbn.UnknownFactorType(), } spbn = SemiparametricBN( - ["a", "b", "c", "d"], [("a", pbn.CKDEType()), ("c", pbn.CKDEType())] + ["A", "B", "C", "D"], [("A", pbn.CKDEType()), ("C", pbn.CKDEType())] ) assert spbn.num_nodes() == 4 assert spbn.num_arcs() == 0 - assert spbn.nodes() == ["a", "b", "c", "d"] + assert spbn.nodes() == ["A", "B", "C", "D"] for n in spbn.nodes(): assert spbn.node_type(n) == expected_node_type[n] spbn = SemiparametricBN( - ["a", "b", "c", "d"], - [("a", "c")], - [("a", pbn.CKDEType()), ("c", pbn.CKDEType())], + ["A", "B", "C", "D"], + [("A", "C")], + [("A", pbn.CKDEType()), ("C", pbn.CKDEType())], ) assert spbn.num_nodes() == 4 assert spbn.num_arcs() == 1 - assert spbn.nodes() == ["a", "b", "c", "d"] + assert spbn.nodes() == ["A", "B", "C", "D"] for n in spbn.nodes(): assert spbn.node_type(n) == expected_node_type[n] spbn = SemiparametricBN( - [("a", "c"), ("b", "d"), ("c", "d")], - [("a", pbn.CKDEType()), ("c", pbn.CKDEType())], + [("A", "C"), ("B", "D"), ("C", "D")], + [("A", pbn.CKDEType()), ("C", pbn.CKDEType())], ) assert spbn.num_nodes() == 4 assert spbn.num_arcs() == 3 - assert spbn.nodes() == ["a", "c", "b", "d"] + assert spbn.nodes() == ["A", "C", "B", "D"] for n in spbn.nodes(): assert spbn.node_type(n) == expected_node_type[n] with pytest.raises(TypeError) as ex: spbn = SemiparametricBN( - ["a", "b", "c"], - [("a", "c", "b")], - [("a", pbn.CKDEType()), ("c", pbn.CKDEType())], + ["A", "B", "C"], + [("A", "C", "B")], + [("A", pbn.CKDEType()), ("C", pbn.CKDEType())], ) assert "incompatible constructor arguments" in str(ex.value) with pytest.raises(IndexError) as ex: spbn = SemiparametricBN( - ["a", "b", "c"], - [("a", "d")], - [("a", pbn.CKDEType()), ("c", pbn.CKDEType())], + ["A", "B", "C"], + [("A", "D")], + [("A", pbn.CKDEType()), ("C", pbn.CKDEType())], ) assert "not present in the graph" in str(ex.value) with pytest.raises(ValueError) as ex: spbn = SemiparametricBN( - [("a", "b"), ("b", "c"), ("c", "a")], - [("a", pbn.CKDEType()), ("c", pbn.CKDEType())], + [("A", "B"), ("B", "C"), ("C", "A")], + [("A", pbn.CKDEType()), ("C", pbn.CKDEType())], ) assert "must be a DAG" in str(ex.value) with pytest.raises(ValueError) as ex: spbn = SemiparametricBN( - ["a", "b", "c", "d"], - [("a", "b"), ("b", "c"), ("c", "a")], - [("a", pbn.CKDEType()), ("c", pbn.CKDEType())], + ["A", "B", "C", "D"], + [("A", "B"), ("B", "C"), ("C", "A")], + [("A", pbn.CKDEType()), ("C", pbn.CKDEType())], ) assert "must be a DAG" in str(ex.value) def test_node_type(): - spbn = SemiparametricBN(["a", "b", "c", "d"]) + spbn = SemiparametricBN(["A", "B", "C", "D"]) assert spbn.num_nodes() == 4 assert spbn.num_arcs() == 0 - assert spbn.nodes() == ["a", "b", "c", "d"] + assert spbn.nodes() == ["A", "B", "C", "D"] for n in spbn.nodes(): assert spbn.node_type(n) == pbn.UnknownFactorType() - spbn.set_node_type("b", pbn.CKDEType()) - assert spbn.node_type("b") == pbn.CKDEType() - spbn.set_node_type("b", pbn.LinearGaussianCPDType()) - assert spbn.node_type("b") == pbn.LinearGaussianCPDType() + spbn.set_node_type("B", pbn.CKDEType()) + assert spbn.node_type("B") == 
pbn.CKDEType() + spbn.set_node_type("B", pbn.LinearGaussianCPDType()) + assert spbn.node_type("B") == pbn.LinearGaussianCPDType() def test_fit(): spbn = SemiparametricBN( - [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")] + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")] ) with pytest.raises(ValueError) as ex: @@ -160,95 +159,95 @@ def test_fit(): spbn.fit(df) - spbn.remove_arc("a", "b") + spbn.remove_arc("A", "B") - cpd_b = spbn.cpd("b") + cpd_b = spbn.cpd("B") assert type(cpd_b) == pbn.LinearGaussianCPD - assert cpd_b.evidence != spbn.parents("b") + assert cpd_b.evidence() != spbn.parents("B") spbn.fit(df) - cpd_b = spbn.cpd("b") + cpd_b = spbn.cpd("B") assert type(cpd_b) == pbn.LinearGaussianCPD - assert cpd_b.evidence() == spbn.parents("b") + assert cpd_b.evidence() == spbn.parents("B") - spbn.set_node_type("c", pbn.CKDEType()) + spbn.set_node_type("C", pbn.CKDEType()) with pytest.raises(ValueError) as ex: - cpd_c = spbn.cpd("c") + cpd_c = spbn.cpd("C") assert "not added" in str(ex.value) spbn.fit(df) - cpd_c = spbn.cpd("c") - assert cpd_c.type() == spbn.node_type("c") + cpd_c = spbn.cpd("C") + assert cpd_c.type() == spbn.node_type("C") def test_cpd(): spbn = SemiparametricBN( - [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")], - [("d", pbn.CKDEType())], + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")], + [("D", pbn.CKDEType())], ) with pytest.raises(ValueError) as ex: - spbn.cpd("a") + spbn.cpd("A") assert "not added" in str(ex.value) spbn.fit(df) - assert spbn.cpd("a").type() == pbn.LinearGaussianCPDType() - assert spbn.cpd("b").type() == pbn.LinearGaussianCPDType() - assert spbn.cpd("c").type() == pbn.LinearGaussianCPDType() - assert spbn.cpd("d").type() == pbn.CKDEType() + assert spbn.cpd("A").type() == pbn.LinearGaussianCPDType() + assert spbn.cpd("B").type() == pbn.LinearGaussianCPDType() + assert spbn.cpd("C").type() == pbn.LinearGaussianCPDType() + assert spbn.cpd("D").type() == pbn.CKDEType() - assert spbn.cpd("a").fitted() - assert spbn.cpd("b").fitted() - assert spbn.cpd("c").fitted() - assert spbn.cpd("d").fitted() + assert spbn.cpd("A").fitted() + assert spbn.cpd("B").fitted() + assert spbn.cpd("C").fitted() + assert spbn.cpd("D").fitted() def test_add_cpds(): spbn = SemiparametricBN( - [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")], - [("d", pbn.CKDEType())], + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")], + [("D", pbn.CKDEType())], ) - assert spbn.node_type("a") == pbn.UnknownFactorType() - spbn.add_cpds([CKDE("a", [])]) - assert spbn.node_type("a") == pbn.CKDEType() + assert spbn.node_type("A") == pbn.UnknownFactorType() + spbn.add_cpds([CKDE("A", [])]) + assert spbn.node_type("A") == pbn.CKDEType() with pytest.raises(ValueError) as ex: - spbn.add_cpds([LinearGaussianCPD("d", ["a", "b", "c"])]) + spbn.add_cpds([LinearGaussianCPD("D", ["A", "B", "C"])]) assert "Bayesian network expects type" in str(ex.value) - lg = LinearGaussianCPD("b", ["a"], [2.5, 1.65], 4) - ckde = CKDE("d", ["a", "b", "c"]) + lg = LinearGaussianCPD("B", ["A"], [2.5, 1.65], 4) + ckde = CKDE("D", ["A", "B", "C"]) assert lg.fitted() assert not ckde.fitted() spbn.add_cpds([lg, ckde]) - spbn.set_node_type("a", pbn.UnknownFactorType()) + spbn.set_node_type("A", pbn.UnknownFactorType()) with pytest.raises(ValueError) as ex: - spbn.cpd("a").fitted() + spbn.cpd("A").fitted() assert ( - 'CPD of variable "a" not added. Call add_cpds() or fit() to add the CPD.' 
+ 'CPD of variable "A" not added. Call add_cpds() or fit() to add the CPD.' in str(ex.value) ) - assert spbn.cpd("b").fitted() + assert spbn.cpd("B").fitted() with pytest.raises(ValueError) as ex: - spbn.cpd("c").fitted() + spbn.cpd("C").fitted() assert ( - 'CPD of variable "c" not added. Call add_cpds() or fit() to add the CPD.' + 'CPD of variable "C" not added. Call add_cpds() or fit() to add the CPD.' in str(ex.value) ) - assert not spbn.cpd("d").fitted() + assert not spbn.cpd("D").fitted() def test_logl(): spbn = SemiparametricBN( - [("a", "b"), ("a", "c"), ("a", "d"), ("b", "c"), ("b", "d"), ("c", "d")] + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")] ) spbn.fit(df) diff --git a/tests/serialization/serialize_factor_test.py b/tests/serialization/serialize_factor_test.py index 0e3a6fea..d6f17f0d 100644 --- a/tests/serialization/serialize_factor_test.py +++ b/tests/serialization/serialize_factor_test.py @@ -2,27 +2,26 @@ import numpy as np import pandas as pd -import pytest - import pybnesian as pbn +import pytest from pybnesian import CKDE, DiscreteFactor, Factor, FactorType, LinearGaussianCPD @pytest.fixture def lg_bytes(): - lg = LinearGaussianCPD("c", ["a", "b"]) + lg = LinearGaussianCPD("C", ["A", "B"]) return pickle.dumps(lg) @pytest.fixture def ckde_bytes(): - ckde = CKDE("c", ["a", "b"]) + ckde = CKDE("C", ["A", "B"]) return pickle.dumps(ckde) @pytest.fixture def discrete_bytes(): - discrete = DiscreteFactor("c", ["a", "b"]) + discrete = DiscreteFactor("C", ["A", "B"]) return pickle.dumps(discrete) @@ -102,13 +101,13 @@ def __setstate__(self, d): @pytest.fixture def new_bytes(): - n = NewFactor("c", ["a", "b"]) + n = NewFactor("C", ["A", "B"]) return pickle.dumps(n) @pytest.fixture def newbis_bytes(): - n = NewFactorBis("c", ["a", "b"]) + n = NewFactorBis("C", ["A", "B"]) return pickle.dumps(n) @@ -116,44 +115,44 @@ def test_serialization_unfitted_factor( lg_bytes, ckde_bytes, discrete_bytes, new_bytes, newbis_bytes ): loaded_lg = pickle.loads(lg_bytes) - assert loaded_lg.variable() == "c" - assert set(loaded_lg.evidence()) == set(["a", "b"]) + assert loaded_lg.variable() == "C" + assert set(loaded_lg.evidence()) == set(["A", "B"]) assert not loaded_lg.fitted() assert loaded_lg.type() == pbn.LinearGaussianCPDType() loaded_ckde = pickle.loads(ckde_bytes) - assert loaded_ckde.variable() == "c" - assert set(loaded_ckde.evidence()) == set(["a", "b"]) + assert loaded_ckde.variable() == "C" + assert set(loaded_ckde.evidence()) == set(["A", "B"]) assert not loaded_ckde.fitted() assert loaded_ckde.type() == pbn.CKDEType() loaded_discrete = pickle.loads(discrete_bytes) - assert loaded_discrete.variable() == "c" - assert set(loaded_discrete.evidence()) == set(["a", "b"]) + assert loaded_discrete.variable() == "C" + assert set(loaded_discrete.evidence()) == set(["A", "B"]) assert not loaded_discrete.fitted() assert loaded_discrete.type() == pbn.DiscreteFactorType() loaded_new = pickle.loads(new_bytes) - assert loaded_new.variable() == "c" - assert set(loaded_new.evidence()) == set(["a", "b"]) + assert loaded_new.variable() == "C" + assert set(loaded_new.evidence()) == set(["A", "B"]) assert not loaded_new.fitted() assert type(loaded_new.type()) == NewType - nn = NewFactor("a", []) + nn = NewFactor("A", []) assert loaded_new.type() == nn.type() from pybnesian import GaussianNetwork - dummy_network = GaussianNetwork(["a", "b", "c", "d"]) - assert type(loaded_new.type().new_factor(dummy_network, "a", [])) == NewFactor + dummy_network = GaussianNetwork(["A", "B", "C", 
"D"]) + assert type(loaded_new.type().new_factor(dummy_network, "A", [])) == NewFactor loaded_newbis = pickle.loads(newbis_bytes) - assert loaded_newbis.variable() == "c" - assert set(loaded_newbis.evidence()) == set(["a", "b"]) + assert loaded_newbis.variable() == "C" + assert set(loaded_newbis.evidence()) == set(["A", "B"]) assert not loaded_newbis.fitted() assert type(loaded_newbis.type()) == NewType - nnbis = NewFactorBis("a", []) + nnbis = NewFactorBis("A", []) assert loaded_newbis.type() == nnbis.type() - assert type(loaded_newbis.type().new_factor(dummy_network, "a", [])) == NewFactorBis + assert type(loaded_newbis.type().new_factor(dummy_network, "A", [])) == NewFactorBis assert loaded_lg.type() != loaded_ckde.type() assert loaded_lg.type() != loaded_discrete.type() @@ -166,7 +165,7 @@ def test_serialization_unfitted_factor( @pytest.fixture def lg_fitted_bytes(): - lg = LinearGaussianCPD("c", ["a", "b"], [1, 2, 3], 0.5) + lg = LinearGaussianCPD("C", ["A", "B"], [1, 2, 3], 0.5) return pickle.dumps(lg) @@ -174,22 +173,22 @@ def lg_fitted_bytes(): def ckde_fitted_bytes(): np.random.seed(1) data = pd.DataFrame( - {"a": np.random.rand(10), "b": np.random.rand(10), "c": np.random.rand(10)} + {"A": np.random.rand(10), "B": np.random.rand(10), "C": np.random.rand(10)} ).astype(float) - ckde = CKDE("c", ["a", "b"]) + ckde = CKDE("C", ["A", "B"]) ckde.fit(data) return pickle.dumps(ckde) @pytest.fixture def discrete_fitted_bytes(): - discrete = DiscreteFactor("c", ["a", "b"]) + discrete = DiscreteFactor("C", ["A", "B"]) data = pd.DataFrame( { - "a": ["a1", "a2", "a1", "a2", "a2", "a2", "a2", "a2"], - "b": ["b1", "b1", "b1", "b1", "b1", "b2", "b1", "b2"], - "c": ["c1", "c1", "c1", "c1", "c2", "c2", "c2", "c2"], + "A": ["A1", "A2", "A1", "A2", "A2", "A2", "A2", "A2"], + "B": ["B1", "B1", "B1", "B1", "B1", "B2", "B1", "B2"], + "C": ["C1", "C1", "C1", "C1", "C2", "C2", "C2", "C2"], }, dtype="category", ) @@ -199,14 +198,14 @@ def discrete_fitted_bytes(): @pytest.fixture def new_fitted_bytes(): - n = NewFactor("c", ["a", "b"]) + n = NewFactor("C", ["A", "B"]) n.fit(None) return pickle.dumps(n) @pytest.fixture def newbis_fitted_bytes(): - n = NewFactorBis("c", ["a", "b"]) + n = NewFactorBis("C", ["A", "B"]) n.fit(None) return pickle.dumps(n) @@ -219,35 +218,35 @@ def test_serialization_fitted_factor( newbis_fitted_bytes, ): loaded_lg = pickle.loads(lg_fitted_bytes) - assert loaded_lg.variable() == "c" - assert set(loaded_lg.evidence()) == set(["a", "b"]) + assert loaded_lg.variable() == "C" + assert set(loaded_lg.evidence()) == set(["A", "B"]) assert loaded_lg.fitted() assert list(loaded_lg.beta) == [1, 2, 3] assert loaded_lg.variance == 0.5 loaded_ckde = pickle.loads(ckde_fitted_bytes) - assert loaded_ckde.variable() == "c" - assert set(loaded_ckde.evidence()) == set(["a", "b"]) + assert loaded_ckde.variable() == "C" + assert set(loaded_ckde.evidence()) == set(["A", "B"]) assert loaded_ckde.fitted() assert loaded_ckde.type() == pbn.CKDEType() assert loaded_ckde.num_instances() == 10 tr = loaded_ckde.kde_joint().dataset().to_pandas() np.random.seed(1) - assert np.all(tr["a"] == np.random.rand(10)) - assert np.all(tr["b"] == np.random.rand(10)) - assert np.all(tr["c"] == np.random.rand(10)) + assert np.all(tr["A"] == np.random.rand(10)) + assert np.all(tr["B"] == np.random.rand(10)) + assert np.all(tr["C"] == np.random.rand(10)) loaded_discrete = pickle.loads(discrete_fitted_bytes) - assert loaded_discrete.variable() == "c" - assert set(loaded_discrete.evidence()) == set(["a", "b"]) + assert 
loaded_discrete.variable() == "C" + assert set(loaded_discrete.evidence()) == set(["A", "B"]) assert loaded_discrete.fitted() assert loaded_discrete.type() == pbn.DiscreteFactorType() test = pd.DataFrame( { - "a": ["a1", "a2", "a1", "a2", "a1", "a2", "a1", "a2"], - "b": ["b1", "b1", "b2", "b2", "b1", "b1", "b2", "b2"], - "c": ["c1", "c1", "c1", "c1", "c2", "c2", "c2", "c2"], + "A": ["A1", "A2", "A1", "A2", "A1", "A2", "A1", "A2"], + "B": ["B1", "B1", "B2", "B2", "B1", "B1", "B2", "B2"], + "C": ["C1", "C1", "C1", "C1", "C2", "C2", "C2", "C2"], }, dtype="category", ) @@ -255,20 +254,20 @@ def test_serialization_fitted_factor( assert list(np.exp(ll)) == [1, 0.5, 0.5, 0, 0, 0.5, 0.5, 1] loaded_new = pickle.loads(new_fitted_bytes) - assert loaded_new.variable() == "c" - assert set(loaded_new.evidence()) == set(["a", "b"]) + assert loaded_new.variable() == "C" + assert set(loaded_new.evidence()) == set(["A", "B"]) assert loaded_new.fitted() assert type(loaded_new.type()) == NewType - nn = NewFactor("a", []) + nn = NewFactor("A", []) assert loaded_new.type() == nn.type() assert loaded_new.some_fit_data == "fitted" loaded_newbis = pickle.loads(newbis_fitted_bytes) - assert loaded_newbis.variable() == "c" - assert set(loaded_newbis.evidence()) == set(["a", "b"]) + assert loaded_newbis.variable() == "C" + assert set(loaded_newbis.evidence()) == set(["A", "B"]) assert loaded_newbis.fitted() assert isinstance(loaded_newbis.type(), NewType) - nn = NewFactorBis("a", []) + nn = NewFactorBis("A", []) assert loaded_newbis.type() == nn.type() assert loaded_newbis.some_fit_data == "fitted" assert isinstance(loaded_newbis.type(), type(loaded_new.type())) diff --git a/tests/serialization/serialize_models_test.py b/tests/serialization/serialize_models_test.py index 07ed37d9..694cc86d 100644 --- a/tests/serialization/serialize_models_test.py +++ b/tests/serialization/serialize_models_test.py @@ -1,10 +1,8 @@ import pickle import pyarrow as pa -import pytest -from util_test import generate_discrete_data_dependent, generate_normal_data_indep - import pybnesian as pbn +import pytest from pybnesian import ( CKDE, BayesianNetwork, @@ -17,29 +15,30 @@ LinearGaussianCPD, SemiparametricBN, ) +from util_test import generate_discrete_data, generate_normal_data_independent @pytest.fixture def gaussian_bytes(): - gaussian = GaussianNetwork(["a", "b", "c", "d"], [("a", "b")]) + gaussian = GaussianNetwork(["A", "B", "C", "D"], [("A", "B")]) return pickle.dumps(gaussian) @pytest.fixture def spbn_bytes(): - spbn = SemiparametricBN(["a", "b", "c", "d"], [("a", "b")], [("b", pbn.CKDEType())]) + spbn = SemiparametricBN(["A", "B", "C", "D"], [("A", "B")], [("B", pbn.CKDEType())]) return pickle.dumps(spbn) @pytest.fixture def kde_bytes(): - kde = KDENetwork(["a", "b", "c", "d"], [("a", "b")]) + kde = KDENetwork(["A", "B", "C", "D"], [("A", "B")]) return pickle.dumps(kde) @pytest.fixture def discrete_bytes(): - discrete = DiscreteBN(["a", "b", "c", "d"], [("a", "b")]) + discrete = DiscreteBN(["A", "B", "C", "D"], [("A", "B")]) return pickle.dumps(discrete) @@ -54,7 +53,7 @@ def default_node_type(self): return pbn.LinearGaussianCPDType() def can_have_arc(self, model, source, target): - return "a" in source + return "A" in source def new_bn(self, nodes): return NewBN(nodes) @@ -69,7 +68,7 @@ def __str__(self): @pytest.fixture def genericbn_bytes(): gen = BayesianNetwork( - MyRestrictedGaussianNetworkType(), ["a", "b", "c", "d"], [("a", "b")] + MyRestrictedGaussianNetworkType(), ["A", "B", "C", "D"], [("A", "B")] ) return 
pickle.dumps(gen) @@ -86,7 +85,7 @@ def __init__(self, variables, arcs=None): @pytest.fixture def newbn_bytes(): - new = NewBN(["a", "b", "c", "d"], [("a", "b")]) + new = NewBN(["A", "B", "C", "D"], [("A", "B")]) return pickle.dumps(new) @@ -142,12 +141,12 @@ def __setstate_extra__(self, t): @pytest.fixture def otherbn_bytes(): other = OtherBN( - ["a", "b", "c", "d"], - [("a", "b")], + ["A", "B", "C", "D"], + [("A", "B")], [ - ("b", pbn.LinearGaussianCPDType()), - ("c", pbn.CKDEType()), - ("d", pbn.DiscreteFactorType()), + ("B", pbn.LinearGaussianCPDType()), + ("C", pbn.CKDEType()), + ("D", pbn.DiscreteFactorType()), ], ) return pickle.dumps(other) @@ -163,50 +162,50 @@ def test_serialization_bn_model( otherbn_bytes, ): loaded_g = pickle.loads(gaussian_bytes) - assert set(loaded_g.nodes()) == set(["a", "b", "c", "d"]) - assert loaded_g.arcs() == [("a", "b")] + assert set(loaded_g.nodes()) == set(["A", "B", "C", "D"]) + assert loaded_g.arcs() == [("A", "B")] assert loaded_g.type() == pbn.GaussianNetworkType() loaded_s = pickle.loads(spbn_bytes) - assert set(loaded_s.nodes()) == set(["a", "b", "c", "d"]) - assert loaded_s.arcs() == [("a", "b")] + assert set(loaded_s.nodes()) == set(["A", "B", "C", "D"]) + assert loaded_s.arcs() == [("A", "B")] assert loaded_s.type() == pbn.SemiparametricBNType() assert loaded_s.node_types() == { - "a": pbn.UnknownFactorType(), - "b": pbn.CKDEType(), - "c": pbn.UnknownFactorType(), - "d": pbn.UnknownFactorType(), + "A": pbn.UnknownFactorType(), + "B": pbn.CKDEType(), + "C": pbn.UnknownFactorType(), + "D": pbn.UnknownFactorType(), } loaded_k = pickle.loads(kde_bytes) - assert set(loaded_k.nodes()) == set(["a", "b", "c", "d"]) - assert loaded_k.arcs() == [("a", "b")] + assert set(loaded_k.nodes()) == set(["A", "B", "C", "D"]) + assert loaded_k.arcs() == [("A", "B")] assert loaded_k.type() == pbn.KDENetworkType() loaded_d = pickle.loads(discrete_bytes) - assert set(loaded_d.nodes()) == set(["a", "b", "c", "d"]) - assert loaded_d.arcs() == [("a", "b")] + assert set(loaded_d.nodes()) == set(["A", "B", "C", "D"]) + assert loaded_d.arcs() == [("A", "B")] assert loaded_d.type() == pbn.DiscreteBNType() loaded_gen = pickle.loads(genericbn_bytes) - assert set(loaded_gen.nodes()) == set(["a", "b", "c", "d"]) - assert loaded_gen.arcs() == [("a", "b")] + assert set(loaded_gen.nodes()) == set(["A", "B", "C", "D"]) + assert loaded_gen.arcs() == [("A", "B")] assert loaded_gen.type() == MyRestrictedGaussianNetworkType() loaded_nn = pickle.loads(newbn_bytes) - assert set(loaded_g.nodes()) == set(["a", "b", "c", "d"]) - assert loaded_nn.arcs() == [("a", "b")] + assert set(loaded_g.nodes()) == set(["A", "B", "C", "D"]) + assert loaded_nn.arcs() == [("A", "B")] assert loaded_nn.type() == MyRestrictedGaussianNetworkType() loaded_o = pickle.loads(otherbn_bytes) - assert set(loaded_g.nodes()) == set(["a", "b", "c", "d"]) - assert loaded_o.arcs() == [("a", "b")] + assert set(loaded_g.nodes()) == set(["A", "B", "C", "D"]) + assert loaded_o.arcs() == [("A", "B")] assert loaded_o.type() == NonHomogeneousType() assert loaded_o.node_types() == { - "a": pbn.UnknownFactorType(), - "b": pbn.LinearGaussianCPDType(), - "c": pbn.CKDEType(), - "d": pbn.DiscreteFactorType(), + "A": pbn.UnknownFactorType(), + "B": pbn.LinearGaussianCPDType(), + "C": pbn.CKDEType(), + "D": pbn.DiscreteFactorType(), } assert loaded_o.extra_info == "extra" @@ -215,8 +214,8 @@ def test_serialization_bn_model( @pytest.fixture def gaussian_partial_fit_bytes(): - gaussian = GaussianNetwork(["a", "b", "c", "d"], [("a", 
"b")]) - lg = pbn.LinearGaussianCPD("b", ["a"], [1, 2], 2) + gaussian = GaussianNetwork(["A", "B", "C", "D"], [("A", "B")]) + lg = pbn.LinearGaussianCPD("B", ["A"], [1, 2], 2) gaussian.add_cpds([lg]) gaussian.include_cpd = True return pickle.dumps(gaussian) @@ -224,11 +223,11 @@ def gaussian_partial_fit_bytes(): @pytest.fixture def gaussian_fit_bytes(): - gaussian = GaussianNetwork(["a", "b", "c", "d"], [("a", "b")]) - lg_a = LinearGaussianCPD("a", [], [0], 0.5) - lg_b = LinearGaussianCPD("b", ["a"], [1, 2], 2) - lg_c = LinearGaussianCPD("c", [], [2], 1) - lg_d = LinearGaussianCPD("d", [], [3], 1.5) + gaussian = GaussianNetwork(["A", "B", "C", "D"], [("A", "B")]) + lg_a = LinearGaussianCPD("A", [], [0], 0.5) + lg_b = LinearGaussianCPD("B", ["A"], [1, 2], 2) + lg_c = LinearGaussianCPD("C", [], [2], 1) + lg_d = LinearGaussianCPD("D", [], [3], 1.5) gaussian.add_cpds([lg_a, lg_b, lg_c, lg_d]) gaussian.include_cpd = True return pickle.dumps(gaussian) @@ -237,15 +236,15 @@ def gaussian_fit_bytes(): @pytest.fixture def other_partial_fit_bytes(): other = OtherBN( - ["a", "b", "c", "d"], - [("a", "b")], + ["A", "B", "C", "D"], + [("A", "B")], [ - ("b", pbn.LinearGaussianCPDType()), - ("c", pbn.CKDEType()), - ("d", pbn.DiscreteFactorType()), + ("B", pbn.LinearGaussianCPDType()), + ("C", pbn.CKDEType()), + ("D", pbn.DiscreteFactorType()), ], ) - lg = LinearGaussianCPD("b", ["a"], [1, 2], 2) + lg = LinearGaussianCPD("B", ["A"], [1, 2], 2) other.add_cpds([lg]) other.include_cpd = True return pickle.dumps(other) @@ -254,24 +253,23 @@ def other_partial_fit_bytes(): @pytest.fixture def other_fit_bytes(): other = OtherBN( - ["a", "b", "c", "d"], - [("a", "b")], + ["A", "B", "C", "D"], + [("A", "B")], [ - ("b", pbn.LinearGaussianCPDType()), - ("c", pbn.CKDEType()), - ("d", pbn.DiscreteFactorType()), + ("B", pbn.LinearGaussianCPDType()), + ("C", pbn.CKDEType()), + ("D", pbn.DiscreteFactorType()), ], ) - cpd_a = LinearGaussianCPD("a", [], [0], 0.5) - cpd_b = LinearGaussianCPD("b", ["a"], [1, 2], 2) + cpd_a = LinearGaussianCPD("A", [], [0], 0.5) + cpd_b = LinearGaussianCPD("B", ["A"], [1, 2], 2) - df_continuous = generate_normal_data_indep(100) - cpd_c = CKDE("c", []) + df_continuous = generate_normal_data_independent(100) + cpd_c = CKDE("C", []) cpd_c.fit(df_continuous) - df_discrete = generate_discrete_data_dependent(100) - df_discrete.columns = df_discrete.columns.str.lower() - cpd_d = DiscreteFactor("d", []) + df_discrete = generate_discrete_data(100) + cpd_d = DiscreteFactor("D", []) cpd_d.fit(df_discrete) other.add_cpds([cpd_a, cpd_b, cpd_c, cpd_d]) @@ -291,9 +289,9 @@ def test_serialization_fitted_bn( # #################### loaded_partial = pickle.loads(gaussian_partial_fit_bytes) assert not loaded_partial.fitted() - cpd = loaded_partial.cpd("b") - assert cpd.variable() == "b" - assert cpd.evidence() == ["a"] + cpd = loaded_partial.cpd("B") + assert cpd.variable() == "B" + assert cpd.evidence() == ["A"] assert list(cpd.beta) == [1, 2] assert cpd.variance == 2 @@ -303,26 +301,26 @@ def test_serialization_fitted_bn( loaded_fitted = pickle.loads(gaussian_fit_bytes) assert loaded_fitted.fitted() - cpd_a = loaded_fitted.cpd("a") - assert cpd_a.variable() == "a" + cpd_a = loaded_fitted.cpd("A") + assert cpd_a.variable() == "A" assert cpd_a.evidence() == [] assert cpd_a.beta == [0] assert cpd_a.variance == 0.5 - cpd_b = loaded_fitted.cpd("b") - assert cpd_b.variable() == "b" - assert cpd_b.evidence() == ["a"] + cpd_b = loaded_fitted.cpd("B") + assert cpd_b.variable() == "B" + assert cpd_b.evidence() == ["A"] 
assert list(cpd_b.beta) == [1, 2] assert cpd_b.variance == 2 - cpd_c = loaded_fitted.cpd("c") - assert cpd_c.variable() == "c" + cpd_c = loaded_fitted.cpd("C") + assert cpd_c.variable() == "C" assert cpd_c.evidence() == [] assert cpd_c.beta == [2] assert cpd_c.variance == 1 - cpd_d = loaded_fitted.cpd("d") - assert cpd_d.variable() == "d" + cpd_d = loaded_fitted.cpd("D") + assert cpd_d.variable() == "D" assert cpd_d.evidence() == [] assert cpd_d.beta == [3] assert cpd_d.variance == 1.5 @@ -332,9 +330,9 @@ def test_serialization_fitted_bn( # #################### loaded_other = pickle.loads(other_partial_fit_bytes) assert not loaded_other.fitted() - cpd = loaded_partial.cpd("b") - assert cpd.variable() == "b" - assert cpd.evidence() == ["a"] + cpd = loaded_partial.cpd("B") + assert cpd.variable() == "B" + assert cpd.evidence() == ["A"] assert list(cpd.beta) == [1, 2] assert cpd.variance == 2 @@ -344,29 +342,29 @@ def test_serialization_fitted_bn( loaded_other_fitted = pickle.loads(other_fit_bytes) assert loaded_other_fitted.fitted() - cpd_a = loaded_other_fitted.cpd("a") - assert cpd_a.variable() == "a" + cpd_a = loaded_other_fitted.cpd("A") + assert cpd_a.variable() == "A" assert cpd_a.evidence() == [] assert cpd_a.beta == [0] assert cpd_a.variance == 0.5 assert cpd_a.type() == pbn.LinearGaussianCPDType() - cpd_b = loaded_other_fitted.cpd("b") - assert cpd_b.variable() == "b" - assert cpd_b.evidence() == ["a"] + cpd_b = loaded_other_fitted.cpd("B") + assert cpd_b.variable() == "B" + assert cpd_b.evidence() == ["A"] assert list(cpd_b.beta) == [1, 2] assert cpd_b.variance == 2 assert cpd_b.type() == pbn.LinearGaussianCPDType() - cpd_c = loaded_other_fitted.cpd("c") - assert cpd_c.variable() == "c" + cpd_c = loaded_other_fitted.cpd("C") + assert cpd_c.variable() == "C" assert cpd_c.evidence() == [] assert cpd_c.fitted() assert cpd_c.num_instances() == 100 assert cpd_c.type() == pbn.CKDEType() - cpd_d = loaded_other_fitted.cpd("d") - assert cpd_d.variable() == "d" + cpd_d = loaded_other_fitted.cpd("D") + assert cpd_d.variable() == "D" assert cpd_d.evidence() == [] assert cpd_d.fitted() assert cpd_d.type() == pbn.DiscreteFactorType() @@ -379,34 +377,34 @@ def test_serialization_fitted_bn( @pytest.fixture def cond_gaussian_bytes(): - gaussian = pbn.ConditionalGaussianNetwork(["c", "d"], ["a", "b"], [("a", "c")]) + gaussian = pbn.ConditionalGaussianNetwork(["C", "D"], ["A", "B"], [("A", "C")]) return pickle.dumps(gaussian) @pytest.fixture def cond_spbn_bytes(): spbn = pbn.ConditionalSemiparametricBN( - ["c", "d"], ["a", "b"], [("a", "c")], [("c", pbn.CKDEType())] + ["C", "D"], ["A", "B"], [("A", "C")], [("C", pbn.CKDEType())] ) return pickle.dumps(spbn) @pytest.fixture def cond_kde_bytes(): - kde = pbn.ConditionalKDENetwork(["c", "d"], ["a", "b"], [("a", "c")]) + kde = pbn.ConditionalKDENetwork(["C", "D"], ["A", "B"], [("A", "C")]) return pickle.dumps(kde) @pytest.fixture def cond_discrete_bytes(): - discrete = pbn.ConditionalDiscreteBN(["c", "d"], ["a", "b"], [("a", "c")]) + discrete = pbn.ConditionalDiscreteBN(["C", "D"], ["A", "B"], [("A", "C")]) return pickle.dumps(discrete) @pytest.fixture def cond_genericbn_bytes(): gen = ConditionalBayesianNetwork( - MyRestrictedGaussianNetworkType(), ["c", "d"], ["a", "b"], [("a", "c")] + MyRestrictedGaussianNetworkType(), ["C", "D"], ["A", "B"], [("A", "C")] ) return pickle.dumps(gen) @@ -425,7 +423,7 @@ def __init__(self, variables, interface, arcs=None): @pytest.fixture def cond_newbn_bytes(): - new = ConditionalNewBN(["c", "d"], ["a", "b"], [("a", 
"c")]) + new = ConditionalNewBN(["C", "D"], ["A", "B"], [("A", "C")]) return pickle.dumps(new) @@ -462,13 +460,13 @@ def __setstate_extra__(self, t): @pytest.fixture def cond_otherbn_bytes(): other = ConditionalOtherBN( - ["c", "d"], - ["a", "b"], - [("a", "c")], + ["C", "D"], + ["A", "B"], + [("A", "C")], [ - ("b", pbn.LinearGaussianCPDType()), - ("c", pbn.CKDEType()), - ("d", pbn.DiscreteFactorType()), + ("B", pbn.LinearGaussianCPDType()), + ("C", pbn.CKDEType()), + ("D", pbn.DiscreteFactorType()), ], ) return pickle.dumps(other) @@ -486,48 +484,48 @@ def test_serialization_conditional_bn_model( otherbn_bytes, ): loaded_g = pickle.loads(cond_gaussian_bytes) - assert set(loaded_g.nodes()) == set(["c", "d"]) - assert set(loaded_g.interface_nodes()) == set(["a", "b"]) - assert loaded_g.arcs() == [("a", "c")] + assert set(loaded_g.nodes()) == set(["C", "D"]) + assert set(loaded_g.interface_nodes()) == set(["A", "B"]) + assert loaded_g.arcs() == [("A", "C")] assert loaded_g.type() == pbn.GaussianNetworkType() loaded_s = pickle.loads(cond_spbn_bytes) - assert set(loaded_s.nodes()) == set(["c", "d"]) - assert set(loaded_s.interface_nodes()) == set(["a", "b"]) - assert loaded_s.arcs() == [("a", "c")] + assert set(loaded_s.nodes()) == set(["C", "D"]) + assert set(loaded_s.interface_nodes()) == set(["A", "B"]) + assert loaded_s.arcs() == [("A", "C")] assert loaded_s.type() == pbn.SemiparametricBNType() - assert loaded_s.node_types() == {"c": pbn.CKDEType(), "d": pbn.UnknownFactorType()} + assert loaded_s.node_types() == {"C": pbn.CKDEType(), "D": pbn.UnknownFactorType()} loaded_k = pickle.loads(cond_kde_bytes) - assert set(loaded_k.nodes()) == set(["c", "d"]) - assert set(loaded_k.interface_nodes()) == set(["a", "b"]) - assert loaded_k.arcs() == [("a", "c")] + assert set(loaded_k.nodes()) == set(["C", "D"]) + assert set(loaded_k.interface_nodes()) == set(["A", "B"]) + assert loaded_k.arcs() == [("A", "C")] assert loaded_k.type() == pbn.KDENetworkType() loaded_d = pickle.loads(cond_discrete_bytes) - assert set(loaded_d.nodes()) == set(["c", "d"]) - assert set(loaded_d.interface_nodes()) == set(["a", "b"]) - assert loaded_d.arcs() == [("a", "c")] + assert set(loaded_d.nodes()) == set(["C", "D"]) + assert set(loaded_d.interface_nodes()) == set(["A", "B"]) + assert loaded_d.arcs() == [("A", "C")] assert loaded_d.type() == pbn.DiscreteBNType() loaded_gen = pickle.loads(cond_genericbn_bytes) - assert set(loaded_gen.nodes()) == set(["c", "d"]) - assert set(loaded_gen.interface_nodes()) == set(["a", "b"]) - assert loaded_gen.arcs() == [("a", "c")] + assert set(loaded_gen.nodes()) == set(["C", "D"]) + assert set(loaded_gen.interface_nodes()) == set(["A", "B"]) + assert loaded_gen.arcs() == [("A", "C")] assert loaded_gen.type() == MyRestrictedGaussianNetworkType() loaded_nn = pickle.loads(cond_newbn_bytes) - assert set(loaded_nn.nodes()) == set(["c", "d"]) - assert set(loaded_nn.interface_nodes()) == set(["a", "b"]) - assert loaded_nn.arcs() == [("a", "c")] + assert set(loaded_nn.nodes()) == set(["C", "D"]) + assert set(loaded_nn.interface_nodes()) == set(["A", "B"]) + assert loaded_nn.arcs() == [("A", "C")] assert loaded_nn.type() == MyRestrictedGaussianNetworkType() loaded_o = pickle.loads(cond_otherbn_bytes) - assert set(loaded_o.nodes()) == set(["c", "d"]) - assert set(loaded_o.interface_nodes()) == set(["a", "b"]) - assert loaded_o.arcs() == [("a", "c")] + assert set(loaded_o.nodes()) == set(["C", "D"]) + assert set(loaded_o.interface_nodes()) == set(["A", "B"]) + assert loaded_o.arcs() == [("A", "C")] 
assert loaded_o.type() == NonHomogeneousType() - assert loaded_o.node_types() == {"c": pbn.CKDEType(), "d": pbn.DiscreteFactorType()} + assert loaded_o.node_types() == {"C": pbn.CKDEType(), "D": pbn.DiscreteFactorType()} assert loaded_o.extra_info == "extra" assert loaded_nn.type() != loaded_o.type() @@ -541,8 +539,8 @@ def test_serialization_conditional_bn_model( @pytest.fixture def cond_gaussian_partial_fit_bytes(): - gaussian = pbn.ConditionalGaussianNetwork(["c", "d"], ["a", "b"], [("a", "c")]) - lg = LinearGaussianCPD("c", ["a"], [1, 2], 2) + gaussian = pbn.ConditionalGaussianNetwork(["C", "D"], ["A", "B"], [("A", "C")]) + lg = LinearGaussianCPD("C", ["A"], [1, 2], 2) gaussian.add_cpds([lg]) gaussian.include_cpd = True return pickle.dumps(gaussian) @@ -550,9 +548,9 @@ def cond_gaussian_partial_fit_bytes(): @pytest.fixture def cond_gaussian_fit_bytes(): - gaussian = pbn.ConditionalGaussianNetwork(["c", "d"], ["a", "b"], [("a", "c")]) - lg_c = LinearGaussianCPD("c", ["a"], [1, 2], 2) - lg_d = LinearGaussianCPD("d", [], [3], 1.5) + gaussian = pbn.ConditionalGaussianNetwork(["C", "D"], ["A", "B"], [("A", "C")]) + lg_c = LinearGaussianCPD("C", ["A"], [1, 2], 2) + lg_d = LinearGaussianCPD("D", [], [3], 1.5) gaussian.add_cpds([lg_c, lg_d]) gaussian.include_cpd = True return pickle.dumps(gaussian) @@ -561,12 +559,12 @@ def cond_gaussian_fit_bytes(): @pytest.fixture def cond_other_partial_fit_bytes(): other = ConditionalOtherBN( - ["c", "d"], - ["a", "b"], - [("a", "c")], - [("c", pbn.CKDEType()), ("d", pbn.LinearGaussianCPDType())], + ["C", "D"], + ["A", "B"], + [("A", "C")], + [("C", pbn.CKDEType()), ("D", pbn.LinearGaussianCPDType())], ) - lg = LinearGaussianCPD("d", [], [3], 1.5) + lg = LinearGaussianCPD("D", [], [3], 1.5) other.add_cpds([lg]) other.include_cpd = True return pickle.dumps(other) @@ -575,20 +573,19 @@ def cond_other_partial_fit_bytes(): @pytest.fixture def cond_other_fit_bytes(): other = ConditionalOtherBN( - ["c", "d"], - ["a", "b"], - [("a", "c")], - [("c", pbn.CKDEType()), ("d", pbn.DiscreteFactorType())], + ["C", "D"], + ["A", "B"], + [("A", "C")], + [("C", pbn.CKDEType()), ("D", pbn.DiscreteFactorType())], ) - cpd_c = CKDE("c", ["a"]) - cpd_d = DiscreteFactor("d", []) + cpd_c = CKDE("C", ["A"]) + cpd_d = DiscreteFactor("D", []) - df_continuous = generate_normal_data_indep(100) + df_continuous = generate_normal_data_independent(100) cpd_c.fit(df_continuous) - df_discrete = generate_discrete_data_dependent(100) - df_discrete.columns = df_discrete.columns.str.lower() - cpd_d = DiscreteFactor("d", []) + df_discrete = generate_discrete_data(100) + cpd_d = DiscreteFactor("D", []) cpd_d.fit(df_discrete) other.add_cpds([cpd_c, cpd_d]) @@ -608,9 +605,9 @@ def test_serialization_fitted_conditional_bn( # #################### loaded_partial = pickle.loads(cond_gaussian_partial_fit_bytes) assert not loaded_partial.fitted() - cpd = loaded_partial.cpd("c") - assert cpd.variable() == "c" - assert cpd.evidence() == ["a"] + cpd = loaded_partial.cpd("C") + assert cpd.variable() == "C" + assert cpd.evidence() == ["A"] assert list(cpd.beta) == [1, 2] assert cpd.variance == 2 @@ -620,14 +617,14 @@ def test_serialization_fitted_conditional_bn( loaded_fitted = pickle.loads(cond_gaussian_fit_bytes) assert loaded_fitted.fitted() - cpd_c = loaded_fitted.cpd("c") - assert cpd_c.variable() == "c" - assert cpd_c.evidence() == ["a"] + cpd_c = loaded_fitted.cpd("C") + assert cpd_c.variable() == "C" + assert cpd_c.evidence() == ["A"] assert list(cpd_c.beta) == [1, 2] assert cpd_c.variance == 2 - cpd_d 
= loaded_fitted.cpd("d") - assert cpd_d.variable() == "d" + cpd_d = loaded_fitted.cpd("D") + assert cpd_d.variable() == "D" assert cpd_d.evidence() == [] assert cpd_d.beta == [3] assert cpd_d.variance == 1.5 @@ -637,8 +634,8 @@ def test_serialization_fitted_conditional_bn( # #################### loaded_other = pickle.loads(cond_other_partial_fit_bytes) assert not loaded_other.fitted() - cpd = loaded_other.cpd("d") - assert cpd.variable() == "d" + cpd = loaded_other.cpd("D") + assert cpd.variable() == "D" assert cpd.evidence() == [] assert cpd.beta == [3] assert cpd.variance == 1.5 @@ -649,15 +646,15 @@ def test_serialization_fitted_conditional_bn( loaded_other_fitted = pickle.loads(cond_other_fit_bytes) assert loaded_other_fitted.fitted() - cpd_c = loaded_other_fitted.cpd("c") - assert cpd_c.variable() == "c" - assert cpd_c.evidence() == ["a"] + cpd_c = loaded_other_fitted.cpd("C") + assert cpd_c.variable() == "C" + assert cpd_c.evidence() == ["A"] assert cpd_c.fitted() assert cpd_c.num_instances() == 100 assert cpd_c.type() == pbn.CKDEType() - cpd_d = loaded_other_fitted.cpd("d") - assert cpd_d.variable() == "d" + cpd_d = loaded_other_fitted.cpd("D") + assert cpd_d.variable() == "D" assert cpd_d.evidence() == [] assert cpd_d.fitted() assert cpd_d.type() == pbn.DiscreteFactorType() @@ -673,44 +670,44 @@ def test_serialization_fitted_conditional_bn( @pytest.fixture def dyn_gaussian_bytes(): - gaussian = pbn.DynamicGaussianNetwork(["a", "b", "c", "d"], 2) - gaussian.static_bn().add_arc("a_t_2", "d_t_1") - gaussian.transition_bn().add_arc("c_t_2", "b_t_0") + gaussian = pbn.DynamicGaussianNetwork(["A", "B", "C", "D"], 2) + gaussian.static_bn().add_arc("A_t_2", "D_t_1") + gaussian.transition_bn().add_arc("C_t_2", "B_t_0") return pickle.dumps(gaussian) @pytest.fixture def dyn_spbn_bytes(): - spbn = pbn.DynamicSemiparametricBN(["a", "b", "c", "d"], 2) - spbn.static_bn().add_arc("a_t_2", "d_t_1") - spbn.transition_bn().add_arc("c_t_2", "b_t_0") - spbn.transition_bn().set_node_type("b_t_0", pbn.CKDEType()) + spbn = pbn.DynamicSemiparametricBN(["A", "B", "C", "D"], 2) + spbn.static_bn().add_arc("A_t_2", "D_t_1") + spbn.transition_bn().add_arc("C_t_2", "B_t_0") + spbn.transition_bn().set_node_type("B_t_0", pbn.CKDEType()) return pickle.dumps(spbn) @pytest.fixture def dyn_kde_bytes(): - kde = pbn.DynamicKDENetwork(["a", "b", "c", "d"], 2) - kde.static_bn().add_arc("a_t_2", "d_t_1") - kde.transition_bn().add_arc("c_t_2", "b_t_0") + kde = pbn.DynamicKDENetwork(["A", "B", "C", "D"], 2) + kde.static_bn().add_arc("A_t_2", "D_t_1") + kde.transition_bn().add_arc("C_t_2", "B_t_0") return pickle.dumps(kde) @pytest.fixture def dyn_discrete_bytes(): - discrete = pbn.DynamicDiscreteBN(["a", "b", "c", "d"], 2) - discrete.static_bn().add_arc("a_t_2", "d_t_1") - discrete.transition_bn().add_arc("c_t_2", "b_t_0") + discrete = pbn.DynamicDiscreteBN(["A", "B", "C", "D"], 2) + discrete.static_bn().add_arc("A_t_2", "D_t_1") + discrete.transition_bn().add_arc("C_t_2", "B_t_0") return pickle.dumps(discrete) @pytest.fixture def dyn_genericbn_bytes(): gen = pbn.DynamicBayesianNetwork( - MyRestrictedGaussianNetworkType(), ["a", "b", "c", "d"], 2 + MyRestrictedGaussianNetworkType(), ["A", "B", "C", "D"], 2 ) - gen.static_bn().add_arc("a_t_2", "d_t_1") - gen.transition_bn().add_arc("a_t_2", "b_t_0") + gen.static_bn().add_arc("A_t_2", "D_t_1") + gen.transition_bn().add_arc("A_t_2", "B_t_0") return pickle.dumps(gen) @@ -742,21 +739,21 @@ def __setstate_extra__(self, t): @pytest.fixture def dyn_newbn_bytes(): - new = 
DynamicNewBN(["a", "b", "c", "d"], 2) - new.static_bn().add_arc("a_t_2", "d_t_1") - new.transition_bn().add_arc("a_t_2", "b_t_0") + new = DynamicNewBN(["A", "B", "C", "D"], 2) + new.static_bn().add_arc("A_t_2", "D_t_1") + new.transition_bn().add_arc("A_t_2", "B_t_0") return pickle.dumps(new) @pytest.fixture def dyn_otherbn_bytes(): - other = DynamicOtherBN(["a", "b", "c", "d"], 2) - other.static_bn().add_arc("a_t_2", "d_t_1") - other.static_bn().set_node_type("c_t_1", pbn.DiscreteFactorType()) - other.static_bn().set_node_type("d_t_1", pbn.CKDEType()) + other = DynamicOtherBN(["A", "B", "C", "D"], 2) + other.static_bn().add_arc("A_t_2", "D_t_1") + other.static_bn().set_node_type("C_t_1", pbn.DiscreteFactorType()) + other.static_bn().set_node_type("D_t_1", pbn.CKDEType()) - other.transition_bn().add_arc("a_t_2", "b_t_0") - other.transition_bn().set_node_type("d_t_0", pbn.CKDEType()) + other.transition_bn().add_arc("A_t_2", "B_t_0") + other.transition_bn().set_node_type("D_t_0", pbn.CKDEType()) return pickle.dumps(other) @@ -770,64 +767,64 @@ def test_serialization_dbn_model( dyn_otherbn_bytes, ): loaded_g = pickle.loads(dyn_gaussian_bytes) - assert set(loaded_g.variables()) == set(["a", "b", "c", "d"]) - assert loaded_g.static_bn().arcs() == [("a_t_2", "d_t_1")] - assert loaded_g.transition_bn().arcs() == [("c_t_2", "b_t_0")] + assert set(loaded_g.variables()) == set(["A", "B", "C", "D"]) + assert loaded_g.static_bn().arcs() == [("A_t_2", "D_t_1")] + assert loaded_g.transition_bn().arcs() == [("C_t_2", "B_t_0")] assert loaded_g.type() == pbn.GaussianNetworkType() loaded_s = pickle.loads(dyn_spbn_bytes) - assert set(loaded_s.variables()) == set(["a", "b", "c", "d"]) - assert loaded_s.static_bn().arcs() == [("a_t_2", "d_t_1")] - assert loaded_s.transition_bn().arcs() == [("c_t_2", "b_t_0")] + assert set(loaded_s.variables()) == set(["A", "B", "C", "D"]) + assert loaded_s.static_bn().arcs() == [("A_t_2", "D_t_1")] + assert loaded_s.transition_bn().arcs() == [("C_t_2", "B_t_0")] assert loaded_s.type() == pbn.SemiparametricBNType() node_types = {v + "_t_0": pbn.UnknownFactorType() for v in loaded_s.variables()} - node_types["b_t_0"] = pbn.CKDEType() + node_types["B_t_0"] = pbn.CKDEType() assert loaded_s.transition_bn().node_types() == node_types loaded_k = pickle.loads(dyn_kde_bytes) - assert set(loaded_k.variables()) == set(["a", "b", "c", "d"]) - assert loaded_k.static_bn().arcs() == [("a_t_2", "d_t_1")] - assert loaded_k.transition_bn().arcs() == [("c_t_2", "b_t_0")] + assert set(loaded_k.variables()) == set(["A", "B", "C", "D"]) + assert loaded_k.static_bn().arcs() == [("A_t_2", "D_t_1")] + assert loaded_k.transition_bn().arcs() == [("C_t_2", "B_t_0")] assert loaded_k.type() == pbn.KDENetworkType() loaded_d = pickle.loads(dyn_discrete_bytes) - assert set(loaded_d.variables()) == set(["a", "b", "c", "d"]) - assert loaded_d.static_bn().arcs() == [("a_t_2", "d_t_1")] - assert loaded_d.transition_bn().arcs() == [("c_t_2", "b_t_0")] + assert set(loaded_d.variables()) == set(["A", "B", "C", "D"]) + assert loaded_d.static_bn().arcs() == [("A_t_2", "D_t_1")] + assert loaded_d.transition_bn().arcs() == [("C_t_2", "B_t_0")] assert loaded_d.type() == pbn.DiscreteBNType() loaded_gen = pickle.loads(dyn_genericbn_bytes) - assert set(loaded_gen.variables()) == set(["a", "b", "c", "d"]) - assert loaded_gen.static_bn().arcs() == [("a_t_2", "d_t_1")] - assert loaded_gen.transition_bn().arcs() == [("a_t_2", "b_t_0")] + assert set(loaded_gen.variables()) == set(["A", "B", "C", "D"]) + assert 
loaded_gen.static_bn().arcs() == [("A_t_2", "D_t_1")] + assert loaded_gen.transition_bn().arcs() == [("A_t_2", "B_t_0")] assert loaded_gen.type() == MyRestrictedGaussianNetworkType() loaded_nn = pickle.loads(dyn_newbn_bytes) - assert set(loaded_nn.variables()) == set(["a", "b", "c", "d"]) - assert loaded_nn.static_bn().arcs() == [("a_t_2", "d_t_1")] - assert loaded_nn.transition_bn().arcs() == [("a_t_2", "b_t_0")] + assert set(loaded_nn.variables()) == set(["A", "B", "C", "D"]) + assert loaded_nn.static_bn().arcs() == [("A_t_2", "D_t_1")] + assert loaded_nn.transition_bn().arcs() == [("A_t_2", "B_t_0")] assert loaded_nn.type() == MyRestrictedGaussianNetworkType() loaded_other = pickle.loads(dyn_otherbn_bytes) - assert set(loaded_other.variables()) == set(["a", "b", "c", "d"]) - assert loaded_other.static_bn().arcs() == [("a_t_2", "d_t_1")] - assert loaded_other.transition_bn().arcs() == [("a_t_2", "b_t_0")] + assert set(loaded_other.variables()) == set(["A", "B", "C", "D"]) + assert loaded_other.static_bn().arcs() == [("A_t_2", "D_t_1")] + assert loaded_other.transition_bn().arcs() == [("A_t_2", "B_t_0")] assert loaded_other.type() == NonHomogeneousType() assert loaded_other.extra_info == "extra" - assert loaded_other.static_bn().node_type("c_t_1") == pbn.DiscreteFactorType() - assert loaded_other.static_bn().node_type("d_t_1") == pbn.CKDEType() - assert loaded_other.transition_bn().node_type("d_t_0") == pbn.CKDEType() + assert loaded_other.static_bn().node_type("C_t_1") == pbn.DiscreteFactorType() + assert loaded_other.static_bn().node_type("D_t_1") == pbn.CKDEType() + assert loaded_other.transition_bn().node_type("D_t_0") == pbn.CKDEType() @pytest.fixture def dyn_gaussian_partial_fit_bytes(): - gaussian = pbn.DynamicGaussianNetwork(["a", "b", "c", "d"], 2) - gaussian.static_bn().add_arc("a_t_2", "d_t_1") - gaussian.transition_bn().add_arc("c_t_2", "b_t_0") - lg = LinearGaussianCPD("d_t_1", ["a_t_2"], [1, 2], 2) + gaussian = pbn.DynamicGaussianNetwork(["A", "B", "C", "D"], 2) + gaussian.static_bn().add_arc("A_t_2", "D_t_1") + gaussian.transition_bn().add_arc("C_t_2", "B_t_0") + lg = LinearGaussianCPD("D_t_1", ["A_t_2"], [1, 2], 2) gaussian.static_bn().add_cpds([lg]) - lg = LinearGaussianCPD("b_t_0", ["c_t_2"], [3, 4], 5) + lg = LinearGaussianCPD("B_t_0", ["C_t_2"], [3, 4], 5) gaussian.transition_bn().add_cpds([lg]) gaussian.include_cpd = True return pickle.dumps(gaussian) @@ -835,10 +832,10 @@ def dyn_gaussian_partial_fit_bytes(): @pytest.fixture def dyn_gaussian_fit_bytes(): - gaussian = pbn.DynamicGaussianNetwork(["a", "b", "c", "d"], 2) - gaussian.static_bn().add_arc("a_t_2", "d_t_1") - gaussian.transition_bn().add_arc("c_t_2", "b_t_0") - df = generate_normal_data_indep(1000) + gaussian = pbn.DynamicGaussianNetwork(["A", "B", "C", "D"], 2) + gaussian.static_bn().add_arc("A_t_2", "D_t_1") + gaussian.transition_bn().add_arc("C_t_2", "B_t_0") + df = generate_normal_data_independent(1000) gaussian.fit(df) gaussian.include_cpd = True return pickle.dumps(gaussian) @@ -846,33 +843,33 @@ def dyn_gaussian_fit_bytes(): @pytest.fixture def dyn_other_partial_fit_bytes(): - variables = ["a", "b", "c", "d"] + variables = ["A", "B", "C", "D"] static_nodes = [v + "_t_" + str(m) for v in variables for m in range(1, 3)] transition_nodes = [v + "_t_0" for v in variables] other_static = OtherBN( static_nodes, - [("a_t_2", "d_t_1")], + [("A_t_2", "D_t_1")], [ - ("b_t_1", pbn.DiscreteFactorType()), - ("c_t_1", pbn.CKDEType()), - ("d_t_1", pbn.LinearGaussianCPDType()), + ("B_t_1", pbn.DiscreteFactorType()), 
+ ("C_t_1", pbn.CKDEType()), + ("D_t_1", pbn.LinearGaussianCPDType()), ], ) - lg = LinearGaussianCPD("d_t_1", ["a_t_2"], [1, 2], 2) + lg = LinearGaussianCPD("D_t_1", ["A_t_2"], [1, 2], 2) other_static.add_cpds([lg]) other_transition = ConditionalOtherBN( transition_nodes, static_nodes, - [("a_t_2", "d_t_0")], + [("A_t_2", "D_t_0")], [ - ("b_t_0", pbn.DiscreteFactorType()), - ("c_t_0", pbn.CKDEType()), - ("d_t_0", pbn.LinearGaussianCPDType()), + ("B_t_0", pbn.DiscreteFactorType()), + ("C_t_0", pbn.CKDEType()), + ("D_t_0", pbn.LinearGaussianCPDType()), ], ) - lg = LinearGaussianCPD("d_t_0", ["a_t_2"], [3, 4], 1.5) + lg = LinearGaussianCPD("D_t_0", ["A_t_2"], [3, 4], 1.5) other_transition.add_cpds([lg]) assert other_static.type() == other_transition.type() @@ -884,43 +881,43 @@ def dyn_other_partial_fit_bytes(): @pytest.fixture def dyn_other_fit_bytes(): - variables = ["a", "b", "c", "d"] + variables = ["A", "B", "C", "D"] static_nodes = [v + "_t_" + str(m) for v in variables for m in range(1, 3)] transition_nodes = [v + "_t_0" for v in variables] other_static = OtherBN( static_nodes, - [("a_t_2", "d_t_1")], + [("A_t_2", "D_t_1")], [ - ("b_t_2", pbn.DiscreteFactorType()), - ("b_t_1", pbn.DiscreteFactorType()), - ("c_t_1", pbn.CKDEType()), - ("d_t_1", pbn.LinearGaussianCPDType()), + ("B_t_2", pbn.DiscreteFactorType()), + ("B_t_1", pbn.DiscreteFactorType()), + ("C_t_1", pbn.CKDEType()), + ("D_t_1", pbn.LinearGaussianCPDType()), ], ) - lg = LinearGaussianCPD("d_t_1", ["a_t_2"], [1, 2], 2) + lg = LinearGaussianCPD("D_t_1", ["A_t_2"], [1, 2], 2) other_static.add_cpds([lg]) other_transition = ConditionalOtherBN( transition_nodes, static_nodes, - [("a_t_2", "d_t_0")], + [("A_t_2", "D_t_0")], [ - ("b_t_0", pbn.DiscreteFactorType()), - ("c_t_0", pbn.CKDEType()), - ("d_t_0", pbn.LinearGaussianCPDType()), + ("B_t_0", pbn.DiscreteFactorType()), + ("C_t_0", pbn.CKDEType()), + ("D_t_0", pbn.LinearGaussianCPDType()), ], ) - lg = LinearGaussianCPD("d_t_0", ["a_t_2"], [3, 4], 1.5) + lg = LinearGaussianCPD("D_t_0", ["A_t_2"], [3, 4], 1.5) other_transition.add_cpds([lg]) assert other_static.type() == other_transition.type() dyn_other = DynamicOtherBN(variables, 2, other_static, other_transition) - df_continuous = generate_normal_data_indep(1000) - df_discrete = generate_discrete_data_dependent(1000) + df_continuous = generate_normal_data_independent(1000) + df_discrete = generate_discrete_data(1000) df = df_continuous - df["b"] = df_discrete["B"] + df["B"] = df_discrete["B"] dyn_other.fit(df) dyn_other.include_cpd = True return pickle.dumps(dyn_other) @@ -939,15 +936,15 @@ def test_serialization_fitted_dbn( assert not loaded_partial.fitted() assert not loaded_partial.static_bn().fitted() assert not loaded_partial.transition_bn().fitted() - cpd = loaded_partial.static_bn().cpd("d_t_1") - assert cpd.variable() == "d_t_1" - assert cpd.evidence() == ["a_t_2"] + cpd = loaded_partial.static_bn().cpd("D_t_1") + assert cpd.variable() == "D_t_1" + assert cpd.evidence() == ["A_t_2"] assert list(cpd.beta) == [1, 2] assert cpd.variance == 2 - cpd = loaded_partial.transition_bn().cpd("b_t_0") - assert cpd.variable() == "b_t_0" - assert cpd.evidence() == ["c_t_2"] + cpd = loaded_partial.transition_bn().cpd("B_t_0") + assert cpd.variable() == "B_t_0" + assert cpd.evidence() == ["C_t_2"] assert list(cpd.beta) == [3, 4] assert cpd.variance == 5 @@ -966,25 +963,25 @@ def test_serialization_fitted_dbn( assert not loaded_partial.fitted() assert not loaded_partial.static_bn().fitted() assert not 
loaded_partial.transition_bn().fitted() - assert loaded_partial.static_bn().node_type("b_t_1") == pbn.DiscreteFactorType() - assert loaded_partial.static_bn().node_type("c_t_1") == pbn.CKDEType() - assert loaded_partial.static_bn().node_type("d_t_1") == pbn.LinearGaussianCPDType() + assert loaded_partial.static_bn().node_type("B_t_1") == pbn.DiscreteFactorType() + assert loaded_partial.static_bn().node_type("C_t_1") == pbn.CKDEType() + assert loaded_partial.static_bn().node_type("D_t_1") == pbn.LinearGaussianCPDType() - assert loaded_partial.transition_bn().node_type("b_t_0") == pbn.DiscreteFactorType() - assert loaded_partial.transition_bn().node_type("c_t_0") == pbn.CKDEType() + assert loaded_partial.transition_bn().node_type("B_t_0") == pbn.DiscreteFactorType() + assert loaded_partial.transition_bn().node_type("C_t_0") == pbn.CKDEType() assert ( - loaded_partial.transition_bn().node_type("d_t_0") == pbn.LinearGaussianCPDType() + loaded_partial.transition_bn().node_type("D_t_0") == pbn.LinearGaussianCPDType() ) - cpd = loaded_partial.static_bn().cpd("d_t_1") - assert cpd.variable() == "d_t_1" - assert cpd.evidence() == ["a_t_2"] + cpd = loaded_partial.static_bn().cpd("D_t_1") + assert cpd.variable() == "D_t_1" + assert cpd.evidence() == ["A_t_2"] assert list(cpd.beta) == [1, 2] assert cpd.variance == 2 - cpd = loaded_partial.transition_bn().cpd("d_t_0") - assert cpd.variable() == "d_t_0" - assert cpd.evidence() == ["a_t_2"] + cpd = loaded_partial.transition_bn().cpd("D_t_0") + assert cpd.variable() == "D_t_0" + assert cpd.evidence() == ["A_t_2"] assert list(cpd.beta) == [3, 4] assert cpd.variance == 1.5 @@ -995,24 +992,24 @@ def test_serialization_fitted_dbn( assert loaded_fitted.fitted() assert loaded_fitted.static_bn().fitted() assert loaded_fitted.transition_bn().fitted() - assert loaded_partial.static_bn().node_type("b_t_1") == pbn.DiscreteFactorType() - assert loaded_partial.static_bn().node_type("c_t_1") == pbn.CKDEType() - assert loaded_partial.static_bn().node_type("d_t_1") == pbn.LinearGaussianCPDType() + assert loaded_partial.static_bn().node_type("B_t_1") == pbn.DiscreteFactorType() + assert loaded_partial.static_bn().node_type("C_t_1") == pbn.CKDEType() + assert loaded_partial.static_bn().node_type("D_t_1") == pbn.LinearGaussianCPDType() - assert loaded_partial.transition_bn().node_type("b_t_0") == pbn.DiscreteFactorType() - assert loaded_partial.transition_bn().node_type("c_t_0") == pbn.CKDEType() + assert loaded_partial.transition_bn().node_type("B_t_0") == pbn.DiscreteFactorType() + assert loaded_partial.transition_bn().node_type("C_t_0") == pbn.CKDEType() assert ( - loaded_partial.transition_bn().node_type("d_t_0") == pbn.LinearGaussianCPDType() + loaded_partial.transition_bn().node_type("D_t_0") == pbn.LinearGaussianCPDType() ) - cpd = loaded_partial.static_bn().cpd("d_t_1") - assert cpd.variable() == "d_t_1" - assert cpd.evidence() == ["a_t_2"] + cpd = loaded_partial.static_bn().cpd("D_t_1") + assert cpd.variable() == "D_t_1" + assert cpd.evidence() == ["A_t_2"] assert list(cpd.beta) == [1, 2] assert cpd.variance == 2 - cpd = loaded_partial.transition_bn().cpd("d_t_0") - assert cpd.variable() == "d_t_0" - assert cpd.evidence() == ["a_t_2"] + cpd = loaded_partial.transition_bn().cpd("D_t_0") + assert cpd.variable() == "D_t_0" + assert cpd.evidence() == ["A_t_2"] assert list(cpd.beta) == [3, 4] assert cpd.variance == 1.5 From 1213cdb90481fa0d75441d8b0b602d4bb7b8e095 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Mon, 25 Nov 2024 11:47:02 +0000 Subject: [PATCH 34/75] 
util_test -> data --- tests/dataset/crossvalidation_test.py | 3 ++- tests/dataset/holdout_test.py | 3 ++- tests/factors/continuous/CKDE_test.py | 3 ++- tests/factors/continuous/KDE_test.py | 3 ++- tests/factors/continuous/LinearGaussianCPD_test.py | 3 ++- tests/factors/continuous/ProductKDE_test.py | 3 ++- tests/factors/discrete/DiscreteFactor_test.py | 3 ++- tests/helpers/{util_test.py => data.py} | 1 - tests/learning/algorithms/hillclimbing_test.py | 3 ++- tests/learning/operators/operatorpool_test.py | 3 ++- tests/learning/operators/operatorset_test.py | 3 ++- tests/learning/parameters/mle_test.py | 3 ++- tests/learning/scores/bic_test.py | 3 ++- tests/learning/scores/cvlikelihood_test.py | 3 ++- tests/learning/scores/holdoutlikelihood_test.py | 3 ++- tests/models/BayesianNetwork_test.py | 3 ++- tests/models/BayesianNetwork_type_test.py | 3 ++- tests/models/DynamicBayesianNetwork_test.py | 3 ++- tests/models/SemiparametricBN_test.py | 3 ++- tests/serialization/serialize_models_test.py | 3 ++- 20 files changed, 38 insertions(+), 20 deletions(-) rename tests/helpers/{util_test.py => data.py} (99%) diff --git a/tests/dataset/crossvalidation_test.py b/tests/dataset/crossvalidation_test.py index afff230f..ee42fd84 100644 --- a/tests/dataset/crossvalidation_test.py +++ b/tests/dataset/crossvalidation_test.py @@ -1,6 +1,7 @@ import numpy as np import pybnesian as pbn -from util_test import generate_normal_data + +from data import generate_normal_data SIZE = 10000 diff --git a/tests/dataset/holdout_test.py b/tests/dataset/holdout_test.py index 773a5126..546fae38 100644 --- a/tests/dataset/holdout_test.py +++ b/tests/dataset/holdout_test.py @@ -1,7 +1,8 @@ import numpy as np import pandas as pd import pybnesian as pbn -from util_test import generate_normal_data + +from data import generate_normal_data SIZE = 10000 diff --git a/tests/factors/continuous/CKDE_test.py b/tests/factors/continuous/CKDE_test.py index e16fd207..ef9281a8 100644 --- a/tests/factors/continuous/CKDE_test.py +++ b/tests/factors/continuous/CKDE_test.py @@ -6,7 +6,8 @@ from scipy.stats import gaussian_kde from scipy.stats import multivariate_normal as mvn from scipy.stats import norm -from util_test import generate_normal_data + +from data import generate_normal_data SIZE = 10000 SMALL_SIZE = 10 diff --git a/tests/factors/continuous/KDE_test.py b/tests/factors/continuous/KDE_test.py index 70629ed2..c7439543 100644 --- a/tests/factors/continuous/KDE_test.py +++ b/tests/factors/continuous/KDE_test.py @@ -4,7 +4,8 @@ import pytest from pybnesian import BandwidthSelector from scipy.stats import gaussian_kde -from util_test import generate_normal_data + +from data import generate_normal_data SIZE = 500 df = generate_normal_data(SIZE, seed=0) diff --git a/tests/factors/continuous/LinearGaussianCPD_test.py b/tests/factors/continuous/LinearGaussianCPD_test.py index eec2b982..65e8c1df 100644 --- a/tests/factors/continuous/LinearGaussianCPD_test.py +++ b/tests/factors/continuous/LinearGaussianCPD_test.py @@ -3,7 +3,8 @@ import pyarrow as pa import pybnesian as pbn from scipy.stats import norm -from util_test import generate_normal_data + +from data import generate_normal_data SIZE = 10000 diff --git a/tests/factors/continuous/ProductKDE_test.py b/tests/factors/continuous/ProductKDE_test.py index 847a8891..766317ff 100644 --- a/tests/factors/continuous/ProductKDE_test.py +++ b/tests/factors/continuous/ProductKDE_test.py @@ -4,7 +4,8 @@ import pytest from pybnesian import BandwidthSelector from scipy.stats import gaussian_kde -from 
util_test import generate_normal_data + +from data import generate_normal_data SIZE = 500 df = generate_normal_data(SIZE, seed=0) diff --git a/tests/factors/discrete/DiscreteFactor_test.py b/tests/factors/discrete/DiscreteFactor_test.py index 798e3772..a1c6bac6 100644 --- a/tests/factors/discrete/DiscreteFactor_test.py +++ b/tests/factors/discrete/DiscreteFactor_test.py @@ -3,7 +3,8 @@ import pyarrow as pa import pybnesian as pbn import pytest -from util_test import generate_discrete_data + +from data import generate_discrete_data df = generate_discrete_data(10000) diff --git a/tests/helpers/util_test.py b/tests/helpers/data.py similarity index 99% rename from tests/helpers/util_test.py rename to tests/helpers/data.py index d8ba0d12..1dabf4e6 100644 --- a/tests/helpers/util_test.py +++ b/tests/helpers/data.py @@ -6,7 +6,6 @@ SEED = 0 -# TODO: Copy to pybnesian def generate_normal_data(size: int, seed: int = SEED) -> pd.DataFrame: """Generates a DataFrame of normally distributed data with linear Gaussian relationships. The relationships are as follows: diff --git a/tests/learning/algorithms/hillclimbing_test.py b/tests/learning/algorithms/hillclimbing_test.py index 537a8f45..d554e225 100644 --- a/tests/learning/algorithms/hillclimbing_test.py +++ b/tests/learning/algorithms/hillclimbing_test.py @@ -1,7 +1,8 @@ import numpy as np import pybnesian as pbn from pybnesian import BayesianNetwork, BayesianNetworkType -from util_test import generate_normal_data + +from data import generate_normal_data df = generate_normal_data(1000) # TODO: Add tests for normal data with dependencies diff --git a/tests/learning/operators/operatorpool_test.py b/tests/learning/operators/operatorpool_test.py index 9b74e71f..c6febebd 100644 --- a/tests/learning/operators/operatorpool_test.py +++ b/tests/learning/operators/operatorpool_test.py @@ -1,6 +1,7 @@ import pybnesian as pbn import pytest -from util_test import generate_normal_data + +from data import generate_normal_data SIZE = 10000 df = generate_normal_data(SIZE) diff --git a/tests/learning/operators/operatorset_test.py b/tests/learning/operators/operatorset_test.py index 768961d7..9d0d600f 100644 --- a/tests/learning/operators/operatorset_test.py +++ b/tests/learning/operators/operatorset_test.py @@ -1,7 +1,8 @@ import numpy as np import pybnesian as pbn import pytest -from util_test import generate_normal_data + +from data import generate_normal_data SIZE = 10000 df = generate_normal_data(SIZE) diff --git a/tests/learning/parameters/mle_test.py b/tests/learning/parameters/mle_test.py index 74d25076..fb672f1d 100644 --- a/tests/learning/parameters/mle_test.py +++ b/tests/learning/parameters/mle_test.py @@ -1,7 +1,8 @@ import numpy as np import pybnesian as pbn import pytest -from util_test import generate_normal_data + +from data import generate_normal_data SIZE = 10000 df = generate_normal_data(SIZE) diff --git a/tests/learning/scores/bic_test.py b/tests/learning/scores/bic_test.py index 28f38927..44196a20 100644 --- a/tests/learning/scores/bic_test.py +++ b/tests/learning/scores/bic_test.py @@ -1,7 +1,8 @@ import numpy as np import pybnesian as pbn from scipy.stats import norm -from util_test import generate_normal_data + +from data import generate_normal_data SIZE = 10000 diff --git a/tests/learning/scores/cvlikelihood_test.py b/tests/learning/scores/cvlikelihood_test.py index 99eebbd0..e3b25c28 100644 --- a/tests/learning/scores/cvlikelihood_test.py +++ b/tests/learning/scores/cvlikelihood_test.py @@ -3,7 +3,8 @@ import pybnesian as pbn import pytest 
from scipy.stats import gaussian_kde, norm -from util_test import generate_normal_data + +from data import generate_normal_data SIZE = 1000 df = generate_normal_data(SIZE) diff --git a/tests/learning/scores/holdoutlikelihood_test.py b/tests/learning/scores/holdoutlikelihood_test.py index dc2ae24f..08d92319 100644 --- a/tests/learning/scores/holdoutlikelihood_test.py +++ b/tests/learning/scores/holdoutlikelihood_test.py @@ -3,7 +3,8 @@ import pybnesian as pbn import pytest from scipy.stats import gaussian_kde, norm -from util_test import generate_normal_data + +from data import generate_normal_data SIZE = 1000 df = generate_normal_data(SIZE) diff --git a/tests/models/BayesianNetwork_test.py b/tests/models/BayesianNetwork_test.py index b28d187a..1420850c 100644 --- a/tests/models/BayesianNetwork_test.py +++ b/tests/models/BayesianNetwork_test.py @@ -2,7 +2,8 @@ import pybnesian as pbn import pytest from pybnesian import BayesianNetwork, GaussianNetwork -from util_test import generate_normal_data + +from data import generate_normal_data df = generate_normal_data(10000) diff --git a/tests/models/BayesianNetwork_type_test.py b/tests/models/BayesianNetwork_type_test.py index 678651a2..5d8bc01a 100644 --- a/tests/models/BayesianNetwork_type_test.py +++ b/tests/models/BayesianNetwork_type_test.py @@ -8,7 +8,8 @@ KDENetwork, SemiparametricBN, ) -from util_test import generate_normal_data_independent + +from data import generate_normal_data_independent def test_bn_type(): diff --git a/tests/models/DynamicBayesianNetwork_test.py b/tests/models/DynamicBayesianNetwork_test.py index 786103a5..9c849b1b 100644 --- a/tests/models/DynamicBayesianNetwork_test.py +++ b/tests/models/DynamicBayesianNetwork_test.py @@ -10,7 +10,8 @@ GaussianNetwork, ) from scipy.stats import norm -from util_test import generate_normal_data + +from data import generate_normal_data df = generate_normal_data(1000) diff --git a/tests/models/SemiparametricBN_test.py b/tests/models/SemiparametricBN_test.py index 56c4d16b..0045d560 100644 --- a/tests/models/SemiparametricBN_test.py +++ b/tests/models/SemiparametricBN_test.py @@ -2,7 +2,8 @@ import pybnesian as pbn import pytest from pybnesian import CKDE, LinearGaussianCPD, SemiparametricBN -from util_test import generate_normal_data + +from data import generate_normal_data df = generate_normal_data(10000) diff --git a/tests/serialization/serialize_models_test.py b/tests/serialization/serialize_models_test.py index 694cc86d..2f39d9ce 100644 --- a/tests/serialization/serialize_models_test.py +++ b/tests/serialization/serialize_models_test.py @@ -15,7 +15,8 @@ LinearGaussianCPD, SemiparametricBN, ) -from util_test import generate_discrete_data, generate_normal_data_independent + +from data import generate_discrete_data, generate_normal_data_independent @pytest.fixture From 0a31a17e60301b247904fff6bc16efcf5f8d80b0 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Mon, 25 Nov 2024 11:47:08 +0000 Subject: [PATCH 35/75] pytest -s --- pytest.ini | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytest.ini b/pytest.ini index 99a3c717..b9c99fa9 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,3 +1,4 @@ [pytest] testpaths = tests -norecursedirs = tests/helpers \ No newline at end of file +norecursedirs = tests/helpers +addopts = -s \ No newline at end of file From 1d01b698e91543edd68c6d48ef4233216cee56f4 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Tue, 26 Nov 2024 08:14:34 +0000 Subject: [PATCH 36/75] data.py reviewed --- tests/helpers/data.py | 8 ++++---- 1 file changed, 4 
insertions(+), 4 deletions(-) diff --git a/tests/helpers/data.py b/tests/helpers/data.py index 1dabf4e6..c68e1567 100644 --- a/tests/helpers/data.py +++ b/tests/helpers/data.py @@ -384,7 +384,7 @@ def generate_normal_data_classification(size: int, seed: int = SEED) -> pd.DataF # Initialization np.random.seed(seed) - class_dict = np.asarray(["Class1", "Class2", "Class3"]) + class_dict = np.asarray(["class1", "class2", "class3"]) class_values = class_dict[ np.random.choice(class_dict.size, size, p=[0.3, 0.4, 0.3]) ] @@ -395,9 +395,9 @@ def generate_normal_data_classification(size: int, seed: int = SEED) -> pd.DataF c_values = np.empty_like(a_values) # Indices - class1_indices = class_values == "Class1" - class2_indices = class_values == "Class2" - class3_indices = class_values == "Class3" + class1_indices = class_values == "class1" + class2_indices = class_values == "class2" + class3_indices = class_values == "class3" # Sampling # b_values based on class_values From c6021520d3a51112c1f740cc5dadf9d33f6e270b Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Thu, 28 Nov 2024 21:33:21 +0000 Subject: [PATCH 37/75] linear_correlation and mutual_info began --- .../independence_tests/independence_test.py | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 tests/learning/independence_tests/independence_test.py diff --git a/tests/learning/independence_tests/independence_test.py b/tests/learning/independence_tests/independence_test.py new file mode 100644 index 00000000..7d33f5b9 --- /dev/null +++ b/tests/learning/independence_tests/independence_test.py @@ -0,0 +1,58 @@ +import itertools + +import numpy as np +import pandas as pd +from pybnesian import KMutualInformation, LinearCorrelation, MutualInformation +from scipy.stats import pearsonr +from sklearn.feature_selection import mutual_info_regression + +from data import generate_normal_data, generate_normal_data_independent + +SIZE = 10000 +SEED = 0 +data = generate_normal_data(SIZE, SEED) + + +def test_linear_correlation(): + df = generate_normal_data(SIZE, SEED)[["A", "B"]] + + correlations = {} + columns = df.columns.tolist() + for col_a, col_b in itertools.combinations(columns, 2): + correlations[col_a + "__" + col_b] = pearsonr( + df.loc[:, col_a], df.loc[:, col_b] + ) + result = pd.DataFrame.from_dict(correlations, orient="index") + result.columns = ["PCC", "p-value"] + + linear_correlation_pvalue = LinearCorrelation(df).pvalue("A", "B") + np.testing.assert_allclose( + np.array([result.loc["A__B", "PCC"]]), + np.array([df.corr().loc["A", "B"]]), + rtol=1e-5, + atol=1e-8, + ) + np.testing.assert_allclose( + np.array([linear_correlation_pvalue]), + np.array([result.loc["A__B", "p-value"]]), + rtol=1e-5, + atol=1e-8, + ) + + +def test_mutual_info(): + n_neighbors = 3 + mutual_info = MutualInformation(data).mi("A", "B") + k_mutual_info = KMutualInformation(data, k=n_neighbors).mi("A", "B") + sklearn_mutual_info = mutual_info_regression( + data[["A"]], data["B"], n_neighbors=n_neighbors + ) + # print("\n", sklearn_mutual_info[0]) + # print(mutual_info) + # print(k_mutual_info) + # np.testing.assert_allclose( + # sklearn_mutual_info, + # np.array([k_mutual_info]), + # rtol=1e-5, + # atol=1e-8, + # ) From 6bdf41ef0f02ec1bf15e17f8c6a0ec18f27db41b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20Casaj=C3=BAs=20Seti=C3=A9n?= Date: Tue, 3 Dec 2024 09:40:34 +0000 Subject: [PATCH 38/75] independence_tests done --- .../independence_tests/independence_test.py | 92 +++++++++++++++---- 1 file changed, 74 insertions(+), 18 deletions(-) 
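
Note: the tests in this patch compare PyBNesian's independence tests against a
reference implementation where one exists, and otherwise only check the
direction of the decision (a dependent pair rejects H0, an independent pair
does not). A minimal sketch of the first pattern, assuming the
generate_normal_data() helper from tests/helpers/data.py is importable (as it
is under this test configuration):

    import pybnesian as pbn
    from scipy.stats import pearsonr

    from data import generate_normal_data

    df = generate_normal_data(10000, seed=0)

    # PyBNesian p-value for the marginal independence of A and B
    pvalue = pbn.LinearCorrelation(df).pvalue("A", "B")

    # Reference p-value from scipy's Pearson correlation test
    _, ref_pvalue = pearsonr(df["A"], df["B"])

Both values should agree up to numerical tolerance and fall below 0.05, since
A and B are dependent by construction in generate_normal_data().
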
diff --git a/tests/learning/independence_tests/independence_test.py b/tests/learning/independence_tests/independence_test.py index 7d33f5b9..467dbf3f 100644 --- a/tests/learning/independence_tests/independence_test.py +++ b/tests/learning/independence_tests/independence_test.py @@ -2,57 +2,113 @@ import numpy as np import pandas as pd -from pybnesian import KMutualInformation, LinearCorrelation, MutualInformation +from pybnesian import KMutualInformation, LinearCorrelation, MutualInformation, RCoT from scipy.stats import pearsonr -from sklearn.feature_selection import mutual_info_regression from data import generate_normal_data, generate_normal_data_independent +# from sklearn.feature_selection import mutual_info_regression + + SIZE = 10000 SEED = 0 data = generate_normal_data(SIZE, SEED) +data_independent = generate_normal_data_independent(SIZE, SEED) +# RFE: Test true and false independence def test_linear_correlation(): - df = generate_normal_data(SIZE, SEED)[["A", "B"]] + df = data[["A", "B"]] + independent_df = data_independent[["A", "B"]] + + # Pybnesian Linear correlation + linear_correlation = LinearCorrelation(df) + independent_linear_correlation = LinearCorrelation(independent_df) + pvalue = linear_correlation.pvalue("A", "B") + independent_pvalue = independent_linear_correlation.pvalue("A", "B") + # scipy pearsonr correlation correlations = {} columns = df.columns.tolist() for col_a, col_b in itertools.combinations(columns, 2): correlations[col_a + "__" + col_b] = pearsonr( df.loc[:, col_a], df.loc[:, col_b] ) - result = pd.DataFrame.from_dict(correlations, orient="index") - result.columns = ["PCC", "p-value"] + result = pd.DataFrame.from_dict( + correlations, orient="index", columns=["PCC", "p-value"] + ) - linear_correlation_pvalue = LinearCorrelation(df).pvalue("A", "B") + # Compare correlation values np.testing.assert_allclose( - np.array([result.loc["A__B", "PCC"]]), np.array([df.corr().loc["A", "B"]]), + np.array([result.loc["A__B", "PCC"]]), rtol=1e-5, atol=1e-8, ) + # Compare p-values np.testing.assert_allclose( - np.array([linear_correlation_pvalue]), + np.array([pvalue]), np.array([result.loc["A__B", "p-value"]]), rtol=1e-5, atol=1e-8, ) + # Check whether the p-values are below the significance level + assert pvalue < 0.05 + assert independent_pvalue > 0.05 def test_mutual_info(): + mutual_info = MutualInformation(data) + independent_mutual_info = MutualInformation(data_independent) + + # Check whether the mutual information is higher when the variables are dependent + mutual_info_value = mutual_info.mi("A", "B") + independent_mutual_info_value = independent_mutual_info.mi("A", "B") + assert mutual_info_value > independent_mutual_info_value + + # Check whether the p-values are below the significance level + pvalue = mutual_info.pvalue("A", "B") + independent_pvalue = independent_mutual_info.pvalue("A", "B") + assert pvalue < 0.05 + assert independent_pvalue > 0.05 + + +def test_k_mutual_info(): n_neighbors = 3 - mutual_info = MutualInformation(data).mi("A", "B") - k_mutual_info = KMutualInformation(data, k=n_neighbors).mi("A", "B") - sklearn_mutual_info = mutual_info_regression( - data[["A"]], data["B"], n_neighbors=n_neighbors - ) - # print("\n", sklearn_mutual_info[0]) - # print(mutual_info) - # print(k_mutual_info) + k_mutual_info = KMutualInformation(data, k=n_neighbors) + independent_k_mutual_info = KMutualInformation(data_independent, k=n_neighbors) + + # Check whether the mutual information is higher when the variables are dependent + k_mutual_info_value = 
k_mutual_info.mi("A", "B") + independent_k_mutual_info_value = independent_k_mutual_info.mi("A", "B") + assert k_mutual_info_value > independent_k_mutual_info_value + + # Check whether the p-values are below the significance level + pvalue = k_mutual_info.pvalue("A", "B") + independent_pvalue = independent_k_mutual_info.pvalue("A", "B") + assert pvalue < 0.05 + assert independent_pvalue > 0.05 + + # RFE: Results vary with scikit-learn, why? + # sklearn_k_mutual_info_value = mutual_info_regression( + # data[["A"]], data["B"], n_neighbors=n_neighbors + # )[0] + # print(k_mutual_info_value) + # print("\n", sklearn_k_mutual_info_value) # np.testing.assert_allclose( - # sklearn_mutual_info, - # np.array([k_mutual_info]), + # sklearn_k_mutual_info_value, + # np.array([k_mutual_info_value]), # rtol=1e-5, # atol=1e-8, # ) + + +def test_rcot(): + rcot = RCoT(data, random_fourier_xy=5, random_fourier_z=100) + independent_rcot = RCoT(data_independent, random_fourier_xy=5, random_fourier_z=100) + + p_value = rcot.pvalue("A", "B") + independent_p_value = independent_rcot.pvalue("A", "B") + + assert p_value < 0.05 + assert independent_p_value > 0.05 From 2ba50b47a7f1620ee8e16c1208d94150944fbc06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20Casaj=C3=BAs=20Seti=C3=A9n?= Date: Tue, 3 Dec 2024 09:50:09 +0000 Subject: [PATCH 39/75] chi square test --- .../independence_tests/independence_test.py | 44 +++++++++++++++---- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/tests/learning/independence_tests/independence_test.py b/tests/learning/independence_tests/independence_test.py index 467dbf3f..ba02e64f 100644 --- a/tests/learning/independence_tests/independence_test.py +++ b/tests/learning/independence_tests/independence_test.py @@ -2,10 +2,21 @@ import numpy as np import pandas as pd -from pybnesian import KMutualInformation, LinearCorrelation, MutualInformation, RCoT +from pybnesian import ( + ChiSquare, + KMutualInformation, + LinearCorrelation, + MutualInformation, + RCoT, +) from scipy.stats import pearsonr -from data import generate_normal_data, generate_normal_data_independent +from data import ( + generate_discrete_data, + generate_discrete_data_independent, + generate_normal_data, + generate_normal_data_independent, +) # from sklearn.feature_selection import mutual_info_regression @@ -13,13 +24,28 @@ SIZE = 10000 SEED = 0 data = generate_normal_data(SIZE, SEED) -data_independent = generate_normal_data_independent(SIZE, SEED) +independent_data = generate_normal_data_independent(SIZE, SEED) + +discrete_data = generate_discrete_data(SIZE, SEED) +independent_discrete_data = generate_discrete_data_independent(SIZE, SEED) + + +def test_chi_square(): + chi_square = ChiSquare(discrete_data) + independent_chi_square = ChiSquare(independent_discrete_data) + + p_value = chi_square.pvalue("A", "B") + independent_p_value = independent_chi_square.pvalue("A", "B") + + # Check whether the p-values are below the significance level + assert p_value < 0.05 + assert independent_p_value > 0.05 # RFE: Test true and false independence def test_linear_correlation(): df = data[["A", "B"]] - independent_df = data_independent[["A", "B"]] + independent_df = independent_data[["A", "B"]] # Pybnesian Linear correlation linear_correlation = LinearCorrelation(df) @@ -52,6 +78,7 @@ def test_linear_correlation(): rtol=1e-5, atol=1e-8, ) + # Check whether the p-values are below the significance level assert pvalue < 0.05 assert independent_pvalue > 0.05 @@ -59,7 +86,7 @@ def test_linear_correlation(): def 
test_mutual_info(): mutual_info = MutualInformation(data) - independent_mutual_info = MutualInformation(data_independent) + independent_mutual_info = MutualInformation(independent_data) # Check whether the mutual information is higher when the variables are dependent mutual_info_value = mutual_info.mi("A", "B") @@ -76,7 +103,7 @@ def test_mutual_info(): def test_k_mutual_info(): n_neighbors = 3 k_mutual_info = KMutualInformation(data, k=n_neighbors) - independent_k_mutual_info = KMutualInformation(data_independent, k=n_neighbors) + independent_k_mutual_info = KMutualInformation(independent_data, k=n_neighbors) # Check whether the mutual information is higher when the variables are dependent k_mutual_info_value = k_mutual_info.mi("A", "B") @@ -84,6 +111,7 @@ def test_k_mutual_info(): assert k_mutual_info_value > independent_k_mutual_info_value # Check whether the p-values are below the significance level + # NOTE: Slow execution pvalue = k_mutual_info.pvalue("A", "B") independent_pvalue = independent_k_mutual_info.pvalue("A", "B") assert pvalue < 0.05 @@ -105,10 +133,10 @@ def test_k_mutual_info(): def test_rcot(): rcot = RCoT(data, random_fourier_xy=5, random_fourier_z=100) - independent_rcot = RCoT(data_independent, random_fourier_xy=5, random_fourier_z=100) - + independent_rcot = RCoT(independent_data, random_fourier_xy=5, random_fourier_z=100) p_value = rcot.pvalue("A", "B") independent_p_value = independent_rcot.pvalue("A", "B") + # Check whether the p-values are below the significance level assert p_value < 0.05 assert independent_p_value > 0.05 From 2d74e2fc406f26a1f59779653806ee816fb3f85f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20Casaj=C3=BAs=20Seti=C3=A9n?= Date: Tue, 3 Dec 2024 09:53:35 +0000 Subject: [PATCH 40/75] independence tests documented --- tests/learning/independence_tests/independence_test.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/learning/independence_tests/independence_test.py b/tests/learning/independence_tests/independence_test.py index ba02e64f..7cf1788e 100644 --- a/tests/learning/independence_tests/independence_test.py +++ b/tests/learning/independence_tests/independence_test.py @@ -31,6 +31,7 @@ def test_chi_square(): + """Test the chi-square independence test with discrete data""" chi_square = ChiSquare(discrete_data) independent_chi_square = ChiSquare(independent_discrete_data) @@ -44,6 +45,7 @@ def test_chi_square(): # RFE: Test true and false independence def test_linear_correlation(): + """Test the linear correlation independence test with normal data""" df = data[["A", "B"]] independent_df = independent_data[["A", "B"]] @@ -85,6 +87,7 @@ def test_linear_correlation(): def test_mutual_info(): + """Test the mutual information independence test with normal data""" mutual_info = MutualInformation(data) independent_mutual_info = MutualInformation(independent_data) @@ -101,6 +104,7 @@ def test_mutual_info(): def test_k_mutual_info(): + """Test the k-nearest neighbors mutual information independence test with normal data""" n_neighbors = 3 k_mutual_info = KMutualInformation(data, k=n_neighbors) independent_k_mutual_info = KMutualInformation(independent_data, k=n_neighbors) @@ -118,6 +122,7 @@ def test_k_mutual_info(): assert independent_pvalue > 0.05 # RFE: Results vary with scikit-learn, why? 
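+    # Plausible causes (annotation, an assumption not verified here):
+    # scikit-learn's mutual_info_regression is based on the Kraskov et al.
+    # k-NN (KSG) estimator and adds a small random jitter to continuous
+    # features before estimating, so its point estimates need not match
+    # KMutualInformation exactly.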
+
     # sklearn_k_mutual_info_value = mutual_info_regression(
     #     data[["A"]], data["B"], n_neighbors=n_neighbors
     # )[0]
@@ -129,9 +134,11 @@ def test_k_mutual_info():
     #     rtol=1e-5,
     #     atol=1e-8,
     # )

+# RFE: Test alternative https://github.com/syanga/pycit
 def test_rcot():
+    """Test the Randomized Conditional Correlation Test (RCoT) independence test with normal data"""
     rcot = RCoT(data, random_fourier_xy=5, random_fourier_z=100)
     independent_rcot = RCoT(independent_data, random_fourier_xy=5, random_fourier_z=100)
     p_value = rcot.pvalue("A", "B")

From 1d77eab832733fa78b5e08e0bc35f1e866937fa7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jorge=20Casaj=C3=BAs=20Seti=C3=A9n?=
Date: Tue, 3 Dec 2024 09:58:00 +0000
Subject: [PATCH 41/75] Independence test summary

---
 .../learning/independence_tests/independence_test.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tests/learning/independence_tests/independence_test.py b/tests/learning/independence_tests/independence_test.py
index 7cf1788e..49fcf3c9 100644
--- a/tests/learning/independence_tests/independence_test.py
+++ b/tests/learning/independence_tests/independence_test.py
@@ -29,6 +29,18 @@
 discrete_data = generate_discrete_data(SIZE, SEED)
 independent_discrete_data = generate_discrete_data_independent(SIZE, SEED)
 
+# INDEPENDENCE TESTS
+# The null hypothesis (H0) is that the two variables are independent,
+# while the alternative hypothesis (H1) is that the two variables are dependent.
+#
+# - If the p-value is less than or equal to the chosen significance level (usually 0.05),
+#   you reject the null hypothesis (H0) in favor of the alternative hypothesis (H1).
+#   This suggests that there is a statistically significant association between the two variables.
+#
+# - If the p-value is greater than the significance level, you do not reject the null hypothesis.
+# This indicates that there is insufficient evidence to conclude that the variables are dependent, +# and it is plausible that they are independent + def test_chi_square(): """Test the chi-square independence test with discrete data""" From b418bc306bac175389b1504aedaaf2902b254427 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Wed, 11 Dec 2024 17:18:09 +0100 Subject: [PATCH 42/75] data MACROs --- tests/helpers/data.py | 1 + .../independence_tests/independence_test.py | 24 +++++++++---------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/tests/helpers/data.py b/tests/helpers/data.py index c68e1567..7aa0063e 100644 --- a/tests/helpers/data.py +++ b/tests/helpers/data.py @@ -4,6 +4,7 @@ TRUE_LABEL = "class_label" DATA_SIZE = 10000 SEED = 0 +N_NEIGHBORS = 3 def generate_normal_data(size: int, seed: int = SEED) -> pd.DataFrame: diff --git a/tests/learning/independence_tests/independence_test.py b/tests/learning/independence_tests/independence_test.py index 49fcf3c9..617f6349 100644 --- a/tests/learning/independence_tests/independence_test.py +++ b/tests/learning/independence_tests/independence_test.py @@ -2,6 +2,15 @@ import numpy as np import pandas as pd +from data import ( + N_NEIGHBORS, + SEED, + SIZE, + generate_discrete_data, + generate_discrete_data_independent, + generate_normal_data, + generate_normal_data_independent, +) from pybnesian import ( ChiSquare, KMutualInformation, @@ -11,18 +20,8 @@ ) from scipy.stats import pearsonr -from data import ( - generate_discrete_data, - generate_discrete_data_independent, - generate_normal_data, - generate_normal_data_independent, -) - # from sklearn.feature_selection import mutual_info_regression - -SIZE = 10000 -SEED = 0 data = generate_normal_data(SIZE, SEED) independent_data = generate_normal_data_independent(SIZE, SEED) @@ -117,9 +116,8 @@ def test_mutual_info(): def test_k_mutual_info(): """Test the k-nearest neighbors mutual information independence test with normal data""" - n_neighbors = 3 - k_mutual_info = KMutualInformation(data, k=n_neighbors) - independent_k_mutual_info = KMutualInformation(independent_data, k=n_neighbors) + k_mutual_info = KMutualInformation(data, k=N_NEIGHBORS) + independent_k_mutual_info = KMutualInformation(independent_data, k=N_NEIGHBORS) # Check whether the mutual information is higher when the variables are dependent k_mutual_info_value = k_mutual_info.mi("A", "B") From 6e1cb57b4f82e6be26592ec28e84fc3dd69245d3 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Thu, 12 Dec 2024 17:19:35 +0100 Subject: [PATCH 43/75] typo fix --- tests/learning/independence_tests/independence_test.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/learning/independence_tests/independence_test.py b/tests/learning/independence_tests/independence_test.py index 617f6349..5b4cb97c 100644 --- a/tests/learning/independence_tests/independence_test.py +++ b/tests/learning/independence_tests/independence_test.py @@ -3,9 +3,9 @@ import numpy as np import pandas as pd from data import ( + DATA_SIZE, N_NEIGHBORS, SEED, - SIZE, generate_discrete_data, generate_discrete_data_independent, generate_normal_data, @@ -22,11 +22,11 @@ # from sklearn.feature_selection import mutual_info_regression -data = generate_normal_data(SIZE, SEED) -independent_data = generate_normal_data_independent(SIZE, SEED) +data = generate_normal_data(DATA_SIZE, SEED) +independent_data = generate_normal_data_independent(DATA_SIZE, SEED) -discrete_data = generate_discrete_data(SIZE, SEED) -independent_discrete_data = 
generate_discrete_data_independent(SIZE, SEED) +discrete_data = generate_discrete_data(DATA_SIZE, SEED) +independent_discrete_data = generate_discrete_data_independent(DATA_SIZE, SEED) # INDEPENDENCE TESTS # The null hypothesis (H0​) is that the two variables are independent, From 38968113baea512d1b2e16b2d2868efd782eab86 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Fri, 20 Dec 2024 13:51:09 +0100 Subject: [PATCH 44/75] warning ignored --- lib/eigen-3.3.7/debug/gdb/printers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/eigen-3.3.7/debug/gdb/printers.py b/lib/eigen-3.3.7/debug/gdb/printers.py index 2b5f9f1b..4869e948 100644 --- a/lib/eigen-3.3.7/debug/gdb/printers.py +++ b/lib/eigen-3.3.7/debug/gdb/printers.py @@ -28,7 +28,7 @@ import re -import gdb +import gdb # type: ignore class EigenMatrixPrinter: From e78146edd59bf5e4c006e8688fd14b8608491e3b Mon Sep 17 00:00:00 2001 From: JuanFPR-UPM Date: Fri, 24 Jan 2025 14:30:59 +0100 Subject: [PATCH 45/75] Hybrid MI numerical limits fix --- .../hybrid/mutual_information.cpp | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/pybnesian/learning/independences/hybrid/mutual_information.cpp b/pybnesian/learning/independences/hybrid/mutual_information.cpp index 4e5d5c87..9f69f0a4 100644 --- a/pybnesian/learning/independences/hybrid/mutual_information.cpp +++ b/pybnesian/learning/independences/hybrid/mutual_information.cpp @@ -1029,7 +1029,7 @@ double MutualInformation::mi_mixed_impl(const std::string& discrete, const std:: } } - return std::max(mi, 0.); + return std::max(mi, util::machine_tol); } double MutualInformation::mi_mixed(const std::string& discrete, const std::string& continuous) const { @@ -1126,6 +1126,10 @@ double MutualInformation::pvalue(const std::string& x, const std::string& y) con auto mi_value = mi(x, y); // Multiply by 2*N to obtain 2*N*MI(X; Y). This follows a X^2 distribution. mi_value *= 2 * m_df.valid_rows(x, y); + + if (std::isinf(mi_value) || std::isnan(mi_value)) { + return 1; + } auto df = calculate_df(x, y); boost::math::chi_squared_distribution chidist(static_cast(df)); @@ -1276,7 +1280,7 @@ double MutualInformation::cmi_discrete_continuous_impl(const std::string& x, // Sum - H(Z) mi -= 0.5 + 0.5 * std::log(2 * util::pi * total_variance); - return std::max(mi, 0.); + return std::max(mi, util::machine_tol); } double MutualInformation::cmi_discrete_continuous(const std::string& x, @@ -1378,6 +1382,10 @@ double MutualInformation::pvalue(const std::string& x, const std::string& y, con auto mi_value = mi(x, y, z); // Multiply by 2*N to obtain 2*N*MI(X; Y). This follows a X^2 distribution. 
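+    // Annotation (standard result, stated here for clarity): 2*N*MI is the
+    // G-test statistic, which under the null hypothesis of independence
+    // asymptotically follows a chi-squared distribution with the degrees of
+    // freedom computed by calculate_df() below.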
mi_value *= 2 * m_df.valid_rows(x, y, z); + + if (std::isinf(mi_value) || std::isnan(mi_value)) { + return 1; + } auto df = calculate_df(x, y, z); boost::math::chi_squared_distribution chidist(static_cast(df)); @@ -1527,7 +1535,7 @@ double MutualInformation::cmi_general_both_discrete(const std::string& x, mi -= pz * h_z; } - return std::max(mi, 0.); + return std::max(mi, util::machine_tol); } double MutualInformation::cmi_general_mixed(const std::string& x_discrete, @@ -1584,7 +1592,7 @@ double MutualInformation::cmi_general_mixed(const std::string& x_discrete, } } - return std::max(mi, 0.); + return std::max(mi, util::machine_tol); } double MutualInformation::cmi_general_both_continuous(const std::string& x, @@ -1621,7 +1629,7 @@ double MutualInformation::cmi_general_both_continuous(const std::string& x, } } - return std::max(mi, 0.); + return std::max(mi, util::machine_tol); } double MutualInformation::cmi_general(const std::string& x, @@ -1744,6 +1752,10 @@ double MutualInformation::pvalue(const std::string& x, const std::string& y, con auto mi_value = cmi_general(x, y, discrete_z, continuous_z); // Multiply by 2*N to obtain 2*N*MI(X; Y). This follows a X^2 distribution. mi_value *= 2 * m_df.valid_rows(x, y, z); + + if (std::isinf(mi_value) || std::isnan(mi_value)) { + return 1; + } auto df = calculate_df(x, y, discrete_z, continuous_z); boost::math::chi_squared_distribution chidist(static_cast(df)); From 40b29f208004c42375fc9a28501798faeb71cb79 Mon Sep 17 00:00:00 2001 From: JuanFPR-UPM Date: Fri, 24 Jan 2025 14:39:40 +0100 Subject: [PATCH 46/75] DiscreteFactor MLE Laplace Smoothing --- pybnesian/learning/parameters/mle_DiscreteFactor.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pybnesian/learning/parameters/mle_DiscreteFactor.cpp b/pybnesian/learning/parameters/mle_DiscreteFactor.cpp index 6b62c8f7..c6f13776 100644 --- a/pybnesian/learning/parameters/mle_DiscreteFactor.cpp +++ b/pybnesian/learning/parameters/mle_DiscreteFactor.cpp @@ -30,9 +30,11 @@ typename DiscreteFactor::ParamsClass _fit(const DataFrame& df, logprob(offset + i) = loguniform; } } else { - double logsum_configuration = std::log(static_cast(sum_configuration)); + // Laplace Smoothing, lambda = 1 (uniform prior) + int lambda = 1; + double logsum_configuration = std::log(static_cast(sum_configuration + lambda * cardinality(0))); for (auto i = 0; i < cardinality(0); ++i) { - logprob(offset + i) = std::log(static_cast(joint_counts(offset + i))) - logsum_configuration; + logprob(offset + i) = std::log(static_cast(joint_counts(offset + i) + lambda)) - logsum_configuration; } } } From a8fdc321f3c0ff5439ceb3f67e291ff68736569d Mon Sep 17 00:00:00 2001 From: JuanFPR-UPM Date: Fri, 7 Mar 2025 14:31:39 +0100 Subject: [PATCH 47/75] MixedKCMI & VPTrees --- docs/source/api/learning/independences.rst | 9 +- .../independences/hybrid/mixed_knncmi.cpp | 655 ++++++++++++++++++ .../independences/hybrid/mixed_knncmi.hpp | 102 +++ .../pybindings_independences.cpp | 109 ++- pybnesian/vptree/vptree.cpp | 635 +++++++++++++++++ pybnesian/vptree/vptree.hpp | 148 ++++ 6 files changed, 1656 insertions(+), 2 deletions(-) create mode 100644 pybnesian/learning/independences/hybrid/mixed_knncmi.cpp create mode 100644 pybnesian/learning/independences/hybrid/mixed_knncmi.hpp create mode 100644 pybnesian/vptree/vptree.cpp create mode 100644 pybnesian/vptree/vptree.hpp diff --git a/docs/source/api/learning/independences.rst b/docs/source/api/learning/independences.rst index dee80b18..1a52211f 100644 --- 
a/docs/source/api/learning/independences.rst +++ b/docs/source/api/learning/independences.rst @@ -76,4 +76,11 @@ Bibliography 938–947. .. [RCoT] Strobl, E. V., Zhang, K., & Visweswaran, S. (2019). Approximate kernel-based conditional independence tests - for fast non-parametric causal discovery. Journal of Causal Inference, 7(1). \ No newline at end of file + for fast non-parametric causal discovery. Journal of Causal Inference, 7(1). + +.. [MSCMI] [1] Mesner, O. C. and Shalizi C. R. (2021) Conditional mutual information estimation for mixed, discrete and + continuous data. IEEE Transactions on Information Theory, 67(1), 464–484. + +.. [MixedCMIKnn] [1] Popescu, O.-I., Gerhardus, A. & Runge, J. (2023). Non-parametric conditional independence testing for + mixed continuous-categorical variables: A novel method and numerical evaluation. arXiv pre-print. + Available: https://arxiv.org/abs/2310.11132 \ No newline at end of file diff --git a/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp b/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp new file mode 100644 index 00000000..f3a07fae --- /dev/null +++ b/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp @@ -0,0 +1,655 @@ +#include +#include +#include +#include +#include + +using Array_ptr = std::shared_ptr; +using vptree::hash_columns; + +namespace learning::independences::hybrid { + +template +DataFrame scale_data(const DataFrame& df, const std::string& scaling) { + using ArrayType = typename arrow::TypeTraits::ArrayType; + using CType = typename ArrowType::c_type; + using kdtree::IndexComparator; + + arrow::SchemaBuilder b(arrow::SchemaBuilder::ConflictPolicy::CONFLICT_ERROR); + std::vector new_columns; + + arrow::NumericBuilder builder; + auto n_rows = df->num_rows(); + + std::vector indices(n_rows); + std::iota(indices.begin(), indices.end(), 0); + + std::vector ranked_data(n_rows); + + for (int j = 0; j < df->num_columns(); ++j) { + auto column = df.col(j); + auto dt = column->type_id(); + switch (dt) { + // discrete variables are kept as their dictionary indices + case Type::DICTIONARY: { + auto column_cast = std::static_pointer_cast(column); + auto indices = std::static_pointer_cast(column_cast->indices()); + for (int i = 0; i < n_rows; ++i) { + RAISE_STATUS_ERROR(builder.Append(static_cast(indices->Value(i)))); + } + break; + } + // transform only the continuous variables + default: { + if (scaling == "normalized_rank") { + auto dwn = df.downcast(j); + auto raw_values = dwn->raw_values(); + + IndexComparator comp(raw_values); + std::sort(indices.begin(), indices.end(), comp); + + for (int i = 0; i < n_rows; ++i) { + ranked_data[indices[i]] = static_cast(i) / static_cast(n_rows - 1); + } + + RAISE_STATUS_ERROR(builder.AppendValues(ranked_data.begin(), ranked_data.end())); + + } else if (scaling == "min_max") { + auto column_cast = std::static_pointer_cast(column); + auto min = df.min(j); + auto max = df.max(j); + if (max != min) { + for (int i = 0; i < n_rows; ++i) { + auto normalized_value = (column_cast->Value(i) - min) / (max - min); + RAISE_STATUS_ERROR(builder.Append(normalized_value)); + } + } else { + throw std::invalid_argument("Constant column in DataFrame."); + } + + } else { + throw std::invalid_argument("Invalid scaling option, must be either normalized_rank or min_max."); + } + } + } + Array_ptr out; + RAISE_STATUS_ERROR(builder.Finish(&out)); + new_columns.push_back(out); + builder.Reset(); + + auto f = arrow::field(df.name(j), out->type()); + RAISE_STATUS_ERROR(b.AddField(f)); + } + + 
RAISE_RESULT_ERROR(auto schema, b.Finish()) + + auto rb = arrow::RecordBatch::Make(schema, n_rows, new_columns); + return DataFrame(rb); +} + +DataFrame scale_data(const DataFrame& df, const std::string& scaling) { + // check continuous columns dtype + auto cont_cols = df.continuous_columns(); + std::shared_ptr dt; + if (cont_cols.size() > 0) { + dt = df.loc(cont_cols).same_type(); + } else { + // if fully discrete use smaller dtype + dt = std::static_pointer_cast(arrow::float32()); + } + switch (dt->id()) { + case Type::DOUBLE: + return scale_data(df, scaling); + case Type::FLOAT: + return scale_data(df, scaling); + default: + throw std::invalid_argument("Wrong data type in MixedKMutualInformation."); + } +} + +double mi_general(VPTree& ztree, + DataFrame& df, + int k, + std::shared_ptr datatype, + std::vector& is_discrete_column, + int tree_leafsize, + unsigned int seed) { + auto n_rows = df->num_rows(); + VPTree vptree(df, datatype, is_discrete_column, tree_leafsize, seed); + // excluding the reference point which is not a neighbor of itself + auto knn_results = vptree.query(df, k + 1); + + VectorXd eps(n_rows); + VectorXi k_hat(n_rows); + for (auto i = 0; i < n_rows; ++i) { + eps(i) = knn_results[i].first(k); + k_hat(i) = knn_results[i].second.size(); + } + + // use the ztree to search in all Z, XZ and YZ subspaces + auto [n_xz, n_yz, n_z] = ztree.count_ball_subspaces(df, eps, is_discrete_column); + + double res = 0; + auto exclude_self = [](int value) { return (value > 1) ? (value - 1) : value; }; + + for (int i = 0; i < n_rows; ++i) { + res += boost::math::digamma(exclude_self(k_hat(i))) + boost::math::digamma(exclude_self(n_z(i))) - + boost::math::digamma(exclude_self(n_xz(i))) - boost::math::digamma(exclude_self(n_yz(i))); + } + + res /= n_rows; + + return res; +} + +double mi_pair(VPTree& ytree, + DataFrame& df, + int k, + std::shared_ptr datatype, + std::vector& is_discrete_column, + int tree_leafsize, + unsigned int seed) { + auto n_rows = df->num_rows(); + VPTree xytree(df, datatype, is_discrete_column, tree_leafsize, seed); + // excluding the reference point which is not a neighbor of itself + auto knn_results = xytree.query(df, k + 1); + + VectorXd eps(n_rows); + VectorXi k_hat(n_rows); + for (auto i = 0; i < n_rows; ++i) { + eps(i) = knn_results[i].first[k]; + k_hat(i) = knn_results[i].second.size(); + } + + auto x_is_discrete_column = std::vector(is_discrete_column.begin(), is_discrete_column.begin() + 1); + auto y_is_discrete_column = std::vector(is_discrete_column.begin() + 1, is_discrete_column.end()); + + auto x_df = df.loc(0); + auto y_df = df.loc(1); + + VPTree xtree(x_df, datatype, x_is_discrete_column, tree_leafsize, seed); + + auto n_x = xtree.count_ball_unconditional(x_df, eps, x_is_discrete_column); + auto n_y = ytree.count_ball_unconditional(y_df, eps, y_is_discrete_column); + + double res = 0; + auto exclude_self = [](int value) { return (value > 1) ? 
(value - 1) : value; }; + + for (int i = 0; i < n_rows; ++i) { + // Z is treated as a constant column, thus n_z = n_rows - 1 + res += boost::math::digamma(exclude_self(k_hat(i))) + boost::math::digamma(n_rows - 1) - + boost::math::digamma(exclude_self(n_x(i))) - boost::math::digamma(exclude_self(n_y(i))); + } + + res /= n_rows; + + return res; +} + +int MixedKMutualInformation::find_minimum_cluster_size(const std::vector& discrete_vars) const { + auto dummy_vars = std::vector(discrete_vars.begin() + 1, discrete_vars.end()); + + auto [cardinality, strides] = factors::discrete::create_cardinality_strides(m_df, discrete_vars); + + auto joint_counts = factors::discrete::joint_counts(m_df, discrete_vars[0], dummy_vars, cardinality, strides); + + int min_cluster_size = std::numeric_limits::max(); + + // find minimum positive cluster size + for (int i = 0; i < joint_counts.size(); ++i) { + if (joint_counts[i] > 0 && joint_counts[i] < min_cluster_size) { + min_cluster_size = joint_counts[i]; + } + } + + return min_cluster_size; +} + +int MixedKMutualInformation::find_minimum_shuffled_cluster_size(const DataFrame& shuffled_df, + const std::vector& discrete_vars) const { + // hash the columns as they are no longer of type arrow::DictionaryArray + std::unordered_map joint_counts; + switch (m_datatype->id()) { + case Type::FLOAT: { + auto data = shuffled_df.downcast_vector(discrete_vars); + auto hashed_cols = hash_columns(data, discrete_vars); + for (long unsigned int i = 0; i < hashed_cols.size(); ++i) { + joint_counts[hashed_cols[i]]++; + } + break; + } + default: { + auto data = shuffled_df.downcast_vector(discrete_vars); + auto hashed_cols = hash_columns(data, discrete_vars); + for (long unsigned int i = 0; i < hashed_cols.size(); ++i) { + joint_counts[hashed_cols[i]]++; + } + } + } + int min_cluster_size = std::numeric_limits::max(); + + // find minimum positive cluster size + for (const auto& [config, count] : joint_counts) { + if (count < min_cluster_size) { + min_cluster_size = count; + } + } + + return min_cluster_size; +} + +std::vector check_discrete_cols(const DataFrame& df, + std::vector& is_discrete_column, + bool& discrete_present, + const std::string& x, + const std::string& y) { + is_discrete_column.push_back(df.is_discrete(x)); + is_discrete_column.push_back(df.is_discrete(y)); + + std::vector discrete_vars; + if (is_discrete_column[0]) { + discrete_vars.push_back(x); + discrete_present = true; + } + if (is_discrete_column[1]) { + discrete_vars.push_back(y); + discrete_present = true; + } + return discrete_vars; +} + +std::vector check_discrete_cols(const DataFrame& df, + std::vector& is_discrete_column, + bool& discrete_present, + const std::string& x, + const std::string& y, + const std::string& z) { + auto discrete_vars = check_discrete_cols(df, is_discrete_column, discrete_present, x, y); + is_discrete_column.push_back(df.is_discrete(z)); + + if (is_discrete_column.back()) { + discrete_vars.push_back(z); + discrete_present = true; + } + + return discrete_vars; +} + +std::vector check_discrete_cols(const DataFrame& df, + std::vector& is_discrete_column, + bool& discrete_present, + const std::string& x, + const std::string& y, + const std::vector& z) { + auto discrete_vars = check_discrete_cols(df, is_discrete_column, discrete_present, x, y); + for (const auto& col : z) { + is_discrete_column.push_back(df.is_discrete(col)); + if (is_discrete_column.back()) { + discrete_vars.push_back(col); + discrete_present = true; + } + } + + return discrete_vars; +} + +double 
MixedKMutualInformation::mi(const std::string& x, const std::string& y) const { + auto subset_df = m_scaled_df.loc(x, y); + std::vector is_discrete_column; + bool discrete_present = false; + int k = m_k; + auto discrete_vars = check_discrete_cols(m_df, is_discrete_column, discrete_present, x, y); + + if (discrete_present && m_adaptive_k) { + auto min_cluster_size = find_minimum_cluster_size(discrete_vars); + k = std::min(k, min_cluster_size); + } + + auto y_is_discrete_column = std::vector(is_discrete_column.begin() + 1, is_discrete_column.end()); + auto y_df = subset_df.loc(1); + VPTree ytree(y_df, m_datatype, y_is_discrete_column, m_tree_leafsize, m_seed); + + return mi_pair(ytree, subset_df, k, m_datatype, is_discrete_column, m_tree_leafsize, m_seed); +} + +double MixedKMutualInformation::mi(const std::string& x, const std::string& y, const std::string& z) const { + auto subset_df = m_scaled_df.loc(x, y, z); + std::vector is_discrete_column; + bool discrete_present = false; + int k = m_k; + auto discrete_vars = check_discrete_cols(m_df, is_discrete_column, discrete_present, x, y, z); + + if (discrete_present && m_adaptive_k) { + auto min_cluster_size = find_minimum_cluster_size(discrete_vars); + k = std::min(k, min_cluster_size); + } + + auto z_is_discrete_column = std::vector(is_discrete_column.begin() + 2, is_discrete_column.end()); + auto z_df = subset_df.loc(2); + VPTree ztree(z_df, m_datatype, z_is_discrete_column, m_tree_leafsize, m_seed); + + return mi_general(ztree, subset_df, k, m_datatype, is_discrete_column, m_tree_leafsize, m_seed); +} + +double MixedKMutualInformation::mi(const std::string& x, const std::string& y, const std::vector& z) const { + auto subset_df = m_scaled_df.loc(x, y, z); + std::vector is_discrete_column; + bool discrete_present = false; + int k = m_k; + auto discrete_vars = check_discrete_cols(m_df, is_discrete_column, discrete_present, x, y, z); + + if (discrete_present && m_adaptive_k) { + auto min_cluster_size = find_minimum_cluster_size(discrete_vars); + k = std::min(k, min_cluster_size); + } + + auto z_df = m_scaled_df.loc(z); + auto z_is_discrete_column = std::vector(is_discrete_column.begin() + 2, is_discrete_column.end()); + + VPTree ztree(z_df, m_datatype, z_is_discrete_column, m_tree_leafsize, m_seed); + return mi_general(ztree, subset_df, k, m_datatype, is_discrete_column, m_tree_leafsize, m_seed); +} + +double compute_mean(const std::vector& data) { + return std::accumulate(data.begin(), data.end(), 0.0) / data.size(); +} + +double compute_variance(const std::vector& data, double mean) { + double variance = 0.0; + for (double x : data) { + variance += std::pow((x - mean), 2); + } + return variance / data.size(); +} + +double compute_skewness(const std::vector& data, double mean, double variance) { + double skewness = 0.0; + for (double x : data) { + skewness += std::pow(x - mean, 3); + } + return (skewness / data.size()) / std::pow(variance, 1.5); +} + +double compute_pvalue(double original_mi, std::vector& permutation_stats, bool gamma_approx) { + double min_value = *std::min_element(permutation_stats.begin(), permutation_stats.end()); + double max_value = *std::max_element(permutation_stats.begin(), permutation_stats.end()); + + if (original_mi > max_value) { + return 0.0; + } else if (original_mi <= min_value) { + return 1.0; + } + + if (gamma_approx) { + // small positive value to ensure positivity + double epsilon = std::numeric_limits::epsilon(); + std::vector shifted_data; + // shift statistics to the positive interval + for (long 
unsigned int i = 0; i < permutation_stats.size(); ++i) { + permutation_stats[i] = permutation_stats[i] - min_value + epsilon; + } + + double mean = compute_mean(permutation_stats); + double variance = compute_variance(permutation_stats, mean); + double skewness = compute_skewness(permutation_stats, mean, variance); + + double shape, scale; + shape = (mean * mean) / variance; + scale = variance / mean; + + // fit gamma using method of moments + boost::math::gamma_distribution<> gamma_dist(shape, scale); + + // use the fitted gamma distribution to compute the p-value + if (skewness > 0) { + return 1 - boost::math::cdf(gamma_dist, original_mi - min_value + epsilon); + } + + return boost::math::cdf(gamma_dist, original_mi - min_value + epsilon); + } + + // crude p-value computation + int count_greater = 0; + + for (long unsigned int i = 0; i < permutation_stats.size(); ++i) { + if (permutation_stats[i] >= original_mi) ++count_greater; + } + + return static_cast(count_greater) / permutation_stats.size(); +} + +double MixedKMutualInformation::pvalue(const std::string& x, const std::string& y) const { + std::mt19937 rng{m_seed}; + std::vector is_discrete_column; + bool discrete_present = false; + int k = m_k; + auto discrete_vars = check_discrete_cols(m_df, is_discrete_column, discrete_present, x, y); + + // the adaptive k affects both the CMI estimates and the shuffling + if (discrete_present && m_adaptive_k) { + auto min_cluster_size = find_minimum_cluster_size(discrete_vars); + k = std::min(k, min_cluster_size); + } + + auto y_is_discrete_column = std::vector(is_discrete_column.begin() + 1, is_discrete_column.end()); + auto shuffled_df = m_scaled_df.loc(Copy(x), y); + auto y_df = shuffled_df.loc(1); + + // reuse the ytree as the Y column will not be shuffled + VPTree ytree(y_df, m_datatype, y_is_discrete_column, m_tree_leafsize, m_seed); + + auto original_mi = mi_pair(ytree, shuffled_df, k, m_datatype, is_discrete_column, m_tree_leafsize, m_seed); + std::vector permutation_stats(m_samples); + + switch (m_datatype->id()) { + case Type::FLOAT: { + auto x_begin = shuffled_df.template mutable_data(0); + auto x_end = x_begin + shuffled_df->num_rows(); + + for (int i = 0; i < m_samples; ++i) { + std::shuffle(x_begin, x_end, rng); + // we compute the adaptive k only if X is discrete + if (is_discrete_column[0] && m_adaptive_k) { + auto min_cluster_size = find_minimum_shuffled_cluster_size(shuffled_df, discrete_vars); + k = std::min(k, min_cluster_size); + } + auto shuffled_value = + mi_pair(ytree, shuffled_df, k, m_datatype, is_discrete_column, m_tree_leafsize, m_seed); + permutation_stats[i] = shuffled_value; + } + break; + } + + default: { + auto x_begin = shuffled_df.template mutable_data(0); + auto x_end = x_begin + shuffled_df->num_rows(); + + for (int i = 0; i < m_samples; ++i) { + std::shuffle(x_begin, x_end, rng); + // we compute the adaptive k only if X is discrete + if (is_discrete_column[0] && m_adaptive_k) { + auto min_cluster_size = find_minimum_shuffled_cluster_size(shuffled_df, discrete_vars); + k = std::min(k, min_cluster_size); + } + auto shuffled_value = + mi_pair(ytree, shuffled_df, k, m_datatype, is_discrete_column, m_tree_leafsize, m_seed); + permutation_stats[i] = shuffled_value; + } + } + } + + return compute_pvalue(original_mi, permutation_stats, m_gamma_approx); +} + +double MixedKMutualInformation::pvalue(const std::string& x, const std::string& y, const std::string& z) const { + auto subset_df = m_scaled_df.loc(x, y, z); + std::vector is_discrete_column; + bool 
discrete_present = false; + int k = m_k; + int shuffle_neighbors = m_shuffle_neighbors; + auto discrete_vars = check_discrete_cols(m_df, is_discrete_column, discrete_present, x, y, z); + + if (discrete_present && m_adaptive_k) { + auto min_cluster_size = find_minimum_cluster_size(discrete_vars); + k = std::min(k, min_cluster_size); + shuffle_neighbors = std::min(shuffle_neighbors, min_cluster_size); + } + + auto x_df = subset_df.loc(0); + + auto z_is_discrete_column = std::vector(is_discrete_column.begin() + 2, is_discrete_column.end()); + auto shuffled_df = m_scaled_df.loc(Copy(x), y, z); + auto z_df = shuffled_df.loc(2); + + // reuse the ztree as the Z column will not be shuffled + VPTree ztree(z_df, m_datatype, z_is_discrete_column, m_tree_leafsize, m_seed); + + auto original_mi = mi_general(ztree, shuffled_df, k, m_datatype, is_discrete_column, m_tree_leafsize, m_seed); + + return shuffled_pvalue( + original_mi, k, shuffle_neighbors, x_df, ztree, z_df, shuffled_df, is_discrete_column, discrete_vars); +} + +double MixedKMutualInformation::pvalue(const std::string& x, + const std::string& y, + const std::vector& z) const { + auto subset_df = m_scaled_df.loc(x, y, z); + std::vector is_discrete_column; + bool discrete_present = false; + int k = m_k; + int shuffle_neighbors = m_shuffle_neighbors; + auto discrete_vars = check_discrete_cols(m_df, is_discrete_column, discrete_present, x, y, z); + + // the adaptive k affects both the CMI estimates and the shuffling + if (discrete_present && m_adaptive_k) { + auto min_cluster_size = find_minimum_cluster_size(discrete_vars); + k = std::min(k, min_cluster_size); + shuffle_neighbors = std::min(shuffle_neighbors, min_cluster_size); + } + + auto x_df = subset_df.loc(0); + + auto z_is_discrete_column = std::vector(is_discrete_column.begin() + 2, is_discrete_column.end()); + auto shuffled_df = m_scaled_df.loc(Copy(x), y, z); + auto z_df = shuffled_df.loc(z); + + // reuse the ztree as the Z column will not be shuffled + VPTree ztree(z_df, m_datatype, z_is_discrete_column, m_tree_leafsize, m_seed); + + auto original_mi = mi_general(ztree, shuffled_df, k, m_datatype, is_discrete_column, m_tree_leafsize, m_seed); + + return shuffled_pvalue( + original_mi, k, shuffle_neighbors, x_df, ztree, z_df, shuffled_df, is_discrete_column, discrete_vars); +} + +/* tries to perform shuffling without replacement */ +template +void shuffle_dataframe(const CType* original_x, + CType* shuffled_x, + const std::vector& order, + std::vector& used, + std::vector& neighbors, + Random& rng) { + // first shuffle the neighbors found in the Z subspace + for (auto& neighbor_list : neighbors) { + auto begin = neighbor_list.data(); + auto end = begin + neighbor_list.size(); + std::shuffle(begin, end, rng); + } + + // using the random order, replace instance with the first unused shuffled neighbor + for (long unsigned int i = 0; i < order.size(); ++i) { + size_t index = order[i]; + int neighbor_index = 0; + long int j = 0; + + for (; j < neighbors[index].size(); ++j) { + neighbor_index = neighbors[index][j]; + if (!used[neighbor_index]) { + break; + } + } + + // if there were collisions, keep instance with original value + if (j == neighbors[index].size()) neighbor_index = index; + + shuffled_x[index] = original_x[neighbor_index]; + used[neighbor_index] = true; + } +} + +double MixedKMutualInformation::shuffled_pvalue(double original_mi, + int k, + int shuffle_neighbors, + DataFrame& x_df, + VPTree& ztree, + DataFrame& z_df, + DataFrame& shuffled_df, + std::vector& is_discrete_column, 
+ std::vector& discrete_vars) const { + std::minstd_rand rng{m_seed}; + std::vector neighbors(m_df->num_rows()); + + auto zknn = ztree.query(z_df, shuffle_neighbors); + + for (size_t i = 0; i < zknn.size(); ++i) { + neighbors[i] = zknn[i].second; + } + + std::vector order(m_df->num_rows()); + std::iota(order.begin(), order.end(), 0); + + std::vector used(m_df->num_rows(), false); + std::vector permutation_stats(m_samples); + + switch (m_datatype->id()) { + case Type::FLOAT: { + auto original_x = x_df.template data(0); + auto shuffled_x = shuffled_df.template mutable_data(0); + + for (int i = 0; i < m_samples; ++i) { + std::shuffle(order.begin(), order.end(), rng); + shuffle_dataframe(original_x, shuffled_x, order, used, neighbors, rng); + // we compute the adaptive k only if X is discrete + if (is_discrete_column[0] && m_adaptive_k) { + auto min_cluster_size = find_minimum_shuffled_cluster_size(shuffled_df, discrete_vars); + k = std::min(k, min_cluster_size); + } + + auto shuffled_value = + mi_general(ztree, shuffled_df, k, m_datatype, is_discrete_column, m_tree_leafsize, m_seed); + + permutation_stats[i] = shuffled_value; + + std::fill(used.begin(), used.end(), false); + } + break; + } + + default: { + auto original_x = x_df.template data(0); + auto shuffled_x = shuffled_df.template mutable_data(0); + + for (int i = 0; i < m_samples; ++i) { + std::shuffle(order.begin(), order.end(), rng); + shuffle_dataframe(original_x, shuffled_x, order, used, neighbors, rng); + // we compute the adaptive k only if X is discrete + if (is_discrete_column[0] && m_adaptive_k) { + auto min_cluster_size = find_minimum_shuffled_cluster_size(shuffled_df, discrete_vars); + k = std::min(k, min_cluster_size); + } + + auto shuffled_value = + mi_general(ztree, shuffled_df, k, m_datatype, is_discrete_column, m_tree_leafsize, m_seed); + + permutation_stats[i] = shuffled_value; + + std::fill(used.begin(), used.end(), false); + } + } + } + + return compute_pvalue(original_mi, permutation_stats, m_gamma_approx); +} + +} // namespace learning::independences::hybrid \ No newline at end of file diff --git a/pybnesian/learning/independences/hybrid/mixed_knncmi.hpp b/pybnesian/learning/independences/hybrid/mixed_knncmi.hpp new file mode 100644 index 00000000..10346ea1 --- /dev/null +++ b/pybnesian/learning/independences/hybrid/mixed_knncmi.hpp @@ -0,0 +1,102 @@ +#ifndef PYBNESIAN_LEARNING_INDEPENDENCES_HYBRID_MS_MUTUAL_INFORMATION_HPP +#define PYBNESIAN_LEARNING_INDEPENDENCES_HYBRID_MS_MUTUAL_INFORMATION_HPP + +#include +#include +#include +#include + +using dataset::DataFrame, dataset::Copy; +using Eigen::MatrixXi; +using Array_ptr = std::shared_ptr; +using vptree::VPTree; + +namespace learning::independences::hybrid { +DataFrame scale_data(const DataFrame& df, const std::string& scaling); + +double mi_general(VPTree& ztree, + DataFrame& df, + int k, + std::shared_ptr datatype, + std::vector& is_discrete_column, + int tree_leafsize, + unsigned int seed); +double mi_pair(VPTree& ytree, + DataFrame& df, + int k, + std::shared_ptr datatype, + std::vector& is_discrete_column, + int tree_leafsize, + unsigned int seed); + +class MixedKMutualInformation : public IndependenceTest { +public: + MixedKMutualInformation(DataFrame df, + int k, + unsigned int seed = std::random_device{}(), + int shuffle_neighbors = 5, + int samples = 1000, + std::string scaling = "min_max", + bool gamma_approx = true, + bool adaptive_k = true, + int tree_leafsize = 16) + : m_df(df), + m_scaled_df(scale_data(df, scaling)), + m_datatype(), + m_k(k), + 
m_seed(seed), + m_shuffle_neighbors(shuffle_neighbors), + m_samples(samples), + m_gamma_approx(gamma_approx), + m_adaptive_k(adaptive_k), + m_tree_leafsize(tree_leafsize) { + m_datatype = m_scaled_df.same_type(); + } + + double pvalue(const std::string& x, const std::string& y) const override; + double pvalue(const std::string& x, const std::string& y, const std::string& z) const override; + double pvalue(const std::string& x, const std::string& y, const std::vector& z) const override; + + double mi(const std::string& x, const std::string& y) const; + double mi(const std::string& x, const std::string& y, const std::string& z) const; + double mi(const std::string& x, const std::string& y, const std::vector& z) const; + + int num_variables() const override { return m_df->num_columns(); } + + std::vector variable_names() const override { return m_df.column_names(); } + + const std::string& name(int i) const override { return m_df.name(i); } + + bool has_variables(const std::string& name) const override { return m_df.has_columns(name); } + + bool has_variables(const std::vector& cols) const override { return m_df.has_columns(cols); } + +private: + double shuffled_pvalue(double original_mi, + int k, + int shuffle_neighbors, + DataFrame& x_df, + VPTree& ztree, + DataFrame& z_df, + DataFrame& shuffled_df, + std::vector& is_discrete_column, + std::vector& discrete_vars) const; + + int find_minimum_cluster_size(const std::vector& discrete_vars) const; + int find_minimum_shuffled_cluster_size(const DataFrame& shuffled_df, + const std::vector& discrete_vars) const; + DataFrame m_df; + DataFrame m_scaled_df; + std::shared_ptr m_datatype; + int m_k; + unsigned int m_seed; + int m_shuffle_neighbors; + int m_samples; + bool m_gamma_approx; + bool m_adaptive_k; + int m_tree_leafsize; +}; + +} // namespace learning::independences::hybrid + +#endif // PYBNESIAN_LEARNING_INDEPENDENCES_HYBRID_MS_MUTUAL_INFORMATION_HPP \ No newline at end of file diff --git a/pybnesian/pybindings/pybindings_learning/pybindings_independences.cpp b/pybnesian/pybindings/pybindings_learning/pybindings_independences.cpp index 481922d5..d9421676 100644 --- a/pybnesian/pybindings/pybindings_learning/pybindings_independences.cpp +++ b/pybnesian/pybindings/pybindings_learning/pybindings_independences.cpp @@ -7,13 +7,15 @@ #include #include #include +#include #include namespace py = pybind11; using learning::independences::IndependenceTest, learning::independences::continuous::LinearCorrelation, learning::independences::continuous::KMutualInformation, learning::independences::continuous::RCoT, - learning::independences::discrete::ChiSquare, learning::independences::hybrid::MutualInformation; + learning::independences::discrete::ChiSquare, learning::independences::hybrid::MutualInformation, + learning::independences::hybrid::MixedKMutualInformation; using learning::independences::DynamicIndependenceTest, learning::independences::continuous::DynamicLinearCorrelation, learning::independences::continuous::DynamicKMutualInformation, learning::independences::continuous::DynamicRCoT, @@ -525,4 +527,109 @@ Initializes a :class:`DynamicChiSquare` with the given :class:`DynamicDataFrame` :param ddf: :class:`DynamicDataFrame` to create the :class:`DynamicChiSquare`. )doc"); + +py::class_>( + root, "MixedKMutualInformation", R"doc( +This class implements a non-parametric independence test that is based on the estimation of the mutual information +using k-nearest neighbors, accelerated using vantage-point trees (VP-Trees). 
This independence test supports a mix of categorical and continuous data.
+
+This independence test is based on both [MSCMI]_ and [MixedCMIKnn]_.
+)doc")
+    .def(py::init([](DataFrame df,
+                     int k,
+                     std::optional<unsigned int> seed,
+                     int shuffle_neighbors,
+                     int samples,
+                     std::string scaling,
+                     bool gamma_approx,
+                     bool adaptive_k,
+                     int tree_leafsize) {
+             if (scaling != "normalized_rank" && scaling != "min_max") {
+                 throw std::invalid_argument("scaling must be either 'min_max' or 'normalized_rank'");
+             }
+             return MixedKMutualInformation(df,
+                                            k,
+                                            random_seed_arg(seed),
+                                            shuffle_neighbors,
+                                            samples,
+                                            scaling,
+                                            gamma_approx,
+                                            adaptive_k,
+                                            tree_leafsize);
+         }),
+         py::arg("df"),
+         py::arg("k") = 10,
+         py::arg("seed") = std::nullopt,
+         py::arg("shuffle_neighbors") = 5,
+         py::arg("samples") = 1000,
+         py::arg("scaling") = "min_max",
+         py::arg("gamma_approx") = true,
+         py::arg("adaptive_k") = true,
+         py::arg("tree_leafsize") = 16,
+         R"doc(
+Initializes a :class:`MixedKMutualInformation` for data ``df``. ``k`` is the number of neighbors in the k-nn model used to
+estimate the mutual information.
+
+This is a permutation independence test, so ``samples`` defines the number of permutations. ``shuffle_neighbors``
+(:math:`k_{perm}` in the original paper [MixedCMIKnn]_) defines how many neighbors are used to perform the conditional
+permutations. ``adaptive_k`` enforces an upper bound on both ``k`` and ``shuffle_neighbors``, so that they are not greater
+than the smallest cluster size (discrete configuration), as suggested in [MixedCMIKnn]_.
+
+:param df: DataFrame on which to calculate the independence tests.
+:param k: Number of neighbors in the k-nn model used to estimate the mutual information.
+:param seed: A random seed number. If not specified or ``None``, a random seed is generated.
+:param shuffle_neighbors: Number of neighbors used to perform the conditional permutation.
+:param samples: Number of permutations for the :class:`MixedKMutualInformation`.
+:param scaling: Transformation of the continuous variables to the [0, 1] range. Can be either "min_max" or "normalized_rank".
+:param gamma_approx: Whether to approximate the p-value by fitting a gamma distribution with the first three moments of the permutation statistics.
+:param adaptive_k: If set to ``True``, upper bounds both ``k`` and ``shuffle_neighbors`` by the minimum discrete configuration size, as in [MixedCMIKnn]_. If set to ``False``,
+    allows the k-nn model to consider dependencies between distinct discrete values, which makes the estimates more biased towards zero, as in [MSCMI]_.
+:param tree_leafsize: Maximum size of a VP-Tree leaf; within a leaf of at most this many points, pruning is abandoned in favor of a brute-force search.
+
+)doc")
+    .def(
+        "mi",
+        [](MixedKMutualInformation& self, const std::string& x, const std::string& y) { return self.mi(x, y); },
+        py::arg("x"),
+        py::arg("y"),
+        R"doc(
+Estimates the unconditional mutual information :math:`\text{MI}(x, y)`.
+
+:param x: A variable name.
+:param y: A variable name.
+:returns: The unconditional mutual information :math:`\text{MI}(x, y)`.
+)doc")
+    .def(
+        "mi",
+        [](MixedKMutualInformation& self, const std::string& x, const std::string& y, const std::string& z) {
+            return self.mi(x, y, z);
+        },
+        py::arg("x"),
+        py::arg("y"),
+        py::arg("z"),
+        R"doc(
+Estimates the univariate conditional mutual information :math:`\text{MI}(x, y \mid z)`.
+
+:param x: A variable name.
+:param y: A variable name.
+:param z: A variable name.
+:returns: The univariate conditional mutual information :math:`\text{MI}(x, y \mid z)`.
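+
+A minimal usage sketch (hypothetical variable names; assume ``df`` is a pandas DataFrame whose
+continuous columns are floats and whose discrete columns use the ``category`` dtype):
+
+.. code-block:: python
+
+    from pybnesian import MixedKMutualInformation
+
+    mixed_mi = MixedKMutualInformation(df, k=10)
+    cmi = mixed_mi.mi("A", "B", "C")       # estimate of MI(A, B | C)
+    pval = mixed_mi.pvalue("A", "B", "C")  # permutation p-value for the independence of A and B given C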
+)doc") + .def( + "mi", + [](MixedKMutualInformation& self, + const std::string& x, + const std::string& y, + const std::vector& z) { return self.mi(x, y, z); }, + py::arg("x"), + py::arg("y"), + py::arg("z"), + R"doc( +Estimates the multivariate conditional mutual information :math:`\text{MI}(x, y \mid \mathbf{z})`. + +:param x: A variable name. +:param y: A variable name. +:param z: A list of variable names. +:returns: The multivariate conditional mutual information :math:`\text{MI}(x, y \mid \mathbf{z})`. +)doc"); } diff --git a/pybnesian/vptree/vptree.cpp b/pybnesian/vptree/vptree.cpp new file mode 100644 index 00000000..7b4e008e --- /dev/null +++ b/pybnesian/vptree/vptree.cpp @@ -0,0 +1,635 @@ +#include + +namespace vptree { + +template +using Neighbor = std::pair; + +template +struct NeighborComparator { + inline bool operator()(const Neighbor& a, const Neighbor& b) { + return a.first < b.first; // max-heap + } +}; + +template +using NeighborQueue = + std::priority_queue, std::vector>, NeighborComparator>; + +template +struct QueryNode { + VPTreeNode* node; + typename ArrowType::c_type min_distance; +}; + +template +struct QueryNodeComparator { + inline bool operator()(const QueryNode& a, const QueryNode& b) { + return a.min_distance > b.min_distance; // closer neighbors are visited first + } +}; + +template +using QueryQueue = + std::priority_queue, std::vector>, QueryNodeComparator>; + +template +std::unique_ptr build_vptree(const HybridChebyshevDistance& distance, + std::vector& indices_parent, + int leafsize, + Random& rng) { + using CType = typename ArrowType::c_type; + + // ending conditions of the recursion + if (indices_parent.empty()) return nullptr; + + if (indices_parent.size() <= static_cast(leafsize)) { + auto leaf = std::make_unique(); + leaf->threshold = 0.0; + leaf->is_leaf = true; + leaf->leaf_indices = indices_parent; + return leaf; + } + + size_t rand_selection = std::uniform_int_distribution(0, indices_parent.size() - 1)(rng); + std::iter_swap(indices_parent.begin() + rand_selection, indices_parent.begin()); + size_t vp_index = indices_parent[0]; + + std::vector> distances_indices(indices_parent.size() - 1); + + CType max = 0; + + // compute distances against the vantange point + for (size_t i = 1; i < indices_parent.size(); ++i) { + auto dist = distance.distance(indices_parent[i], vp_index); + distances_indices[i - 1] = std::make_pair(dist, indices_parent[i]); + if (dist > max) max = dist; + } + + // super-leaf for configurations where all points overlap + if (max == 0) { + auto leaf = std::make_unique(); + leaf->threshold = 0.0; + leaf->is_leaf = true; + leaf->leaf_indices = indices_parent; + + return leaf; + } + + // search for a distance of value 1 + auto it = std::find_if(distances_indices.begin(), distances_indices.end(), [](const std::pair& p) { + return p.first == 1; // Check if any distance is 1 + }); + + // prioritize discrete splits + double threshold = 1.0; + + if (it == distances_indices.end()) { + // if none, node radius is the median + std::nth_element( + distances_indices.begin(), + distances_indices.begin() + distances_indices.size() / 2, + distances_indices.end(), + [](const std::pair& a, const std::pair& b) { return a.first > b.first; }); + threshold = distances_indices[distances_indices.size() / 2].first; + } + + std::vector indices_left, indices_right; + + /*notice how placing the >= on the right child does not affect continuous splits, + but significantly improves the discrete splits, which are binary {0,1}*/ + for (size_t i = 0; i < 
distances_indices.size(); ++i) { + if (distances_indices[i].first < threshold) { + indices_left.push_back(distances_indices[i].second); + } else { + indices_right.push_back(distances_indices[i].second); + } + } + + auto node = std::make_unique(); + + node->index = vp_index; + node->threshold = threshold; + node->is_leaf = false; + + node->left = build_vptree(distance, indices_left, leafsize, rng); + node->right = build_vptree(distance, indices_right, leafsize, rng); + + return node; +} + +std::unique_ptr VPTree::build_vptree(const DataFrame& df, + const std::shared_ptr datatype, + const std::vector& is_discrete_column, + int leafsize, + unsigned int seed) { + std::vector indices(m_df->num_rows()); + std::iota(indices.begin(), indices.end(), 0); + std::mt19937 rng{seed}; + switch (datatype->id()) { + case Type::DOUBLE: { + auto data = df.downcast_vector(); + + HybridChebyshevDistance distance(data, is_discrete_column); + return vptree::build_vptree(distance, indices, leafsize, rng); + } + case Type::FLOAT: { + auto data = df.downcast_vector(); + + HybridChebyshevDistance distance(data, is_discrete_column); + return vptree::build_vptree(distance, indices, leafsize, rng); + } + default: + throw std::invalid_argument("Wrong data type to apply VPTree."); + } +} + +std::vector> VPTree::query(const DataFrame& test_df, int k) const { + if (k >= m_df->num_rows()) { + throw std::invalid_argument("\"k\" value equal or greater to training data size."); + } + + test_df.raise_has_columns(m_column_names); + + std::vector> res(test_df->num_rows()); + + switch (m_datatype->id()) { + case Type::FLOAT: { + auto test = test_df.downcast_vector(); + HybridChebyshevDistance dist(test, m_is_discrete_column); + + auto hash_keys = hash_columns(test, m_column_names); + + for (int i = 0; i < test_df->num_rows(); ++i) { + auto key = hash_keys[i]; + + auto it = m_query_cache.find(key); + if (it != m_query_cache.end()) { + res[i] = it->second; + // Skip the query, use cached result + } else { + auto t = query_instance(i, k, dist); + res[i] = t; + + m_query_cache[key] = t; + } + } + + break; + } + + default: { + auto test = test_df.downcast_vector(); + + HybridChebyshevDistance dist(test, m_is_discrete_column); + + auto hash_keys = hash_columns(test, m_column_names); + for (int i = 0; i < test_df->num_rows(); ++i) { + auto key = hash_keys[i]; + + auto it = m_query_cache.find(key); + if (it != m_query_cache.end()) { + res[i] = it->second; + // Skip the query, use cached result + } else { + auto t = query_instance(i, k, dist); + res[i] = t; + + m_query_cache[key] = t; + } + } + } + } + + // cleared because after permuting X the XYZ space will not be the same + m_query_cache.clear(); + + return res; +} + +std::tuple VPTree::count_ball_subspaces(const DataFrame& test_df, + const VectorXd& eps, + std::vector& is_discrete_column) const { + test_df.raise_has_columns(m_column_names); + + auto n_rows = test_df->num_rows(); + VectorXi count_xz(n_rows); + VectorXi count_yz(n_rows); + VectorXi count_z(n_rows); + + switch (m_datatype->id()) { + case Type::FLOAT: { + auto test = test_df.downcast_vector(); + HybridChebyshevDistance distance_xyz(test, is_discrete_column); + + auto hash_keys = hash_columns(test, test_df.column_names()); + + for (int i = 0; i < n_rows; ++i) { + auto key = hash_keys[i]; + + boost::hash_combine(key, eps(i)); + + auto it = m_count_cache.find(key); + if (it != m_count_cache.end()) { + count_xz(i) = std::get<0>(it->second); + count_yz(i) = std::get<1>(it->second); + count_z(i) = std::get<2>(it->second); + // 
Skip the query, use cached result + } else { + auto c = count_ball_subspaces_instance(i, eps(i), distance_xyz); + + count_xz(i) = std::get<0>(c); + count_yz(i) = std::get<1>(c); + count_z(i) = std::get<2>(c); + + m_count_cache[key] = c; + } + } + break; + } + default: { + auto test = test_df.downcast_vector(); + HybridChebyshevDistance distance_xyz(test, is_discrete_column); + + auto hash_keys = hash_columns(test, test_df.column_names()); + + for (int i = 0; i < n_rows; ++i) { + auto key = hash_keys[i]; + + boost::hash_combine(key, eps(i)); + + auto it = m_count_cache.find(key); + if (it != m_count_cache.end()) { + count_xz(i) = std::get<0>(it->second); + count_yz(i) = std::get<1>(it->second); + count_z(i) = std::get<2>(it->second); + // Skip the query, use cached result + } else { + auto c = count_ball_subspaces_instance(i, eps(i), distance_xyz); + + count_xz(i) = std::get<0>(c); + count_yz(i) = std::get<1>(c); + count_z(i) = std::get<2>(c); + + m_count_cache[key] = c; + } + } + } + } + + // cleared because after permuting X the XYZ space will not be the same + m_count_cache.clear(); + + return std::make_tuple(count_xz, count_yz, count_z); +} + +template +std::vector vptree::hash_columns( + const std::vector::ArrayType>>& data, + std::vector column_names) { + int num_rows = data.empty() ? 0 : data[0]->length(); + std::vector row_hashes(num_rows, 0); + + size_t colnames_hash = boost::hash_range(column_names.begin(), column_names.end()); + + for (long unsigned int j = 0; j < data.size(); ++j) { + for (int i = 0; i < num_rows; ++i) { + auto value = data[j]->Value(i); + + boost::hash_combine(row_hashes[i], value); + } + } + + // column names are needed as the discrete values are all dummy dictionary keys 0,1,2... + for (int i = 0; i < num_rows; ++i) { + boost::hash_combine(row_hashes[i], colnames_hash); + } + + return row_hashes; +} + +VectorXi VPTree::count_ball_unconditional(const DataFrame& test_df, + const VectorXd& eps, + std::vector& is_discrete_column) const { + test_df.raise_has_columns(m_column_names); + + auto n_rows = test_df->num_rows(); + VectorXi count_n(n_rows); + + switch (m_datatype->id()) { + case Type::FLOAT: { + auto test = test_df.downcast_vector(); + HybridChebyshevDistance distance(test, is_discrete_column); + + auto hash_keys = hash_columns(test, test_df.column_names()); + + for (int i = 0; i < n_rows; ++i) { + auto key = hash_keys[i]; + + boost::hash_combine(key, eps(i)); + + auto it = m_count_cache_unconditional.find(key); + if (it != m_count_cache_unconditional.end()) { + count_n(i) = it->second; + continue; // Skip the query, use cached result + } + + auto c = count_ball_unconditional_instance(i, eps(i), distance); + + count_n(i) = c; + + m_count_cache_unconditional[key] = c; + } + + break; + } + default: { + auto test = test_df.downcast_vector(); + HybridChebyshevDistance distance(test, is_discrete_column); + + auto hash_keys = hash_columns(test, test_df.column_names()); + + for (int i = 0; i < n_rows; ++i) { + auto key = hash_keys[i]; + + boost::hash_combine(key, eps(i)); + + auto it = m_count_cache_unconditional.find(key); + if (it != m_count_cache_unconditional.end()) { + count_n(i) = it->second; + continue; // Skip the query, use cached result + } + + auto c = count_ball_unconditional_instance(i, eps(i), distance); + + count_n(i) = c; + + m_count_cache_unconditional[key] = c; + } + } + } + + /*here we do not clear the cache since the Y subspace will not be permuted, + and recycled yTrees may benefit from it*/ + + return count_n; +} + +template +std::pair 
VPTree::query_instance(size_t i, + int k, + const HybridChebyshevDistance& distance) const { + using CType = typename ArrowType::c_type; + + // max-heap + NeighborQueue neighborhood; + + // list at the top of the max-heap, that allows storing neighbors tying at the knn distance + std::pair> neighborhood_star; + + CType distance_upper_bound = neighborhood_star.first = std::numeric_limits::infinity(), distance_neigh = 0; + + // iterative approach that avoid recursion overhead + QueryQueue query_nodes; + CType min_distance = 0; + + // start at the root node + query_nodes.push(QueryNode{m_root.get(), min_distance}); + + while (!query_nodes.empty()) { + auto& query = query_nodes.top(); + auto node = query.node; + + query_nodes.pop(); + + std::vector eval_neighbors(1, node->index); + + if (node->is_leaf) { + eval_neighbors = node->leaf_indices; + } + + auto num_neighbors = eval_neighbors.size(); + + for (auto it_neigh = eval_neighbors.begin(), neigh_end = eval_neighbors.end(); it_neigh != neigh_end; + ++it_neigh) { + distance_neigh = distance.distance(*it_neigh, i); + + if (neighborhood.size() == static_cast(k)) { + if (distance_neigh < distance_upper_bound) { + neighborhood.pop(); + neighborhood.push(std::make_pair(distance_neigh, *it_neigh)); + // check tying neighbors are still equal to the knn + if (neighborhood_star.first > neighborhood.top().first) { + neighborhood_star.second.clear(); + } + } else if (distance_neigh == distance_upper_bound) { + // process super-leaf values as one + if (num_neighbors > static_cast(m_leafsize)) { + neighborhood_star.second.reserve(neighborhood_star.second.size() + + std::distance(it_neigh, neigh_end)); + neighborhood_star.second.insert(neighborhood_star.second.end(), it_neigh, neigh_end); + neighborhood_star.first = distance_neigh; + break; + } else { + neighborhood_star.second.push_back(*it_neigh); + neighborhood_star.first = distance_neigh; + } + // process super-leaf values as one + } else if (num_neighbors > static_cast(m_leafsize)) + break; + } else { + neighborhood.push(std::make_pair(distance_neigh, *it_neigh)); + } + + if (neighborhood.size() == static_cast(k)) { + distance_upper_bound = neighborhood.top().first; + } + } + + // use triangular inequality to prune branches + CType left_min_distance = distance_neigh - node->threshold; + + // epsilon enforces inequality for discrete distances + if (left_min_distance == 0 && distance_neigh == 1) left_min_distance += std::numeric_limits::epsilon(); + + if (node->left && left_min_distance <= distance_upper_bound) { + query_nodes.push(QueryNode{node->left.get(), left_min_distance}); + } + + CType right_min_distance = node->threshold - distance_neigh; + + if (node->right && right_min_distance <= distance_upper_bound) { + query_nodes.push(QueryNode{node->right.get(), right_min_distance}); + } + } + + auto k_hat = k + neighborhood_star.second.size(); + VectorXd distances(k); // just size k since the tying neighbors all share the same knn distance + VectorXi indices(k_hat); + + std::copy(neighborhood_star.second.begin(), + neighborhood_star.second.end(), + indices.data() + (k_hat - neighborhood_star.second.size())); + + auto u = k - 1; + while (!neighborhood.empty()) { + auto& neigh = neighborhood.top(); + distances(u) = neigh.first; + indices(u) = neigh.second; + neighborhood.pop(); + --u; + } + + return std::make_pair(distances, indices); +} + +template +std::tuple VPTree::count_ball_subspaces_instance( + size_t i, + const typename ArrowType::c_type eps_value, + const HybridChebyshevDistance& distance_xyz) 
const { + using CType = typename ArrowType::c_type; + + CType min_distance = 0, d_z = 0; + + int count_xz = 0, count_yz = 0, count_z = 0; + + // iterative approach that avoid recursion overhead + QueryQueue query_nodes; + + // start at the root node + query_nodes.push(QueryNode{m_root.get(), min_distance}); + + std::vector z_indices(m_df->num_columns()); + std::iota(z_indices.begin(), z_indices.end(), 2); + + std::vector x_index(1, 0); + std::vector y_index(1, 1); + + while (!query_nodes.empty()) { + auto& query = query_nodes.top(); + auto node = query.node; + + query_nodes.pop(); + + std::vector eval_neighbors(1, node->index); + + if (node->is_leaf) { + eval_neighbors = node->leaf_indices; + } + + auto num_neighbors = eval_neighbors.size(); + + for (auto it_neigh = eval_neighbors.begin(), neigh_end = eval_neighbors.end(); it_neigh != neigh_end; + ++it_neigh) { + // trick: since Z is a subspace of XZ and YZ, we can constrain the vptree building and search just to Z, + // then check for X&Y + d_z = distance_xyz.distance_coords(*it_neigh, i, z_indices); + + if (d_z <= eps_value) { + if (num_neighbors <= static_cast(m_leafsize)) { + ++count_z; + if (distance_xyz.distance_coords(*it_neigh, i, x_index) <= eps_value) ++count_xz; + if (distance_xyz.distance_coords(*it_neigh, i, y_index) <= eps_value) ++count_yz; + } else { + // process super-leaf values as one, at least for Z + count_z += num_neighbors; + for (; it_neigh != neigh_end; ++it_neigh) { + if (distance_xyz.distance_coords(*it_neigh, i, x_index) <= eps_value) ++count_xz; + if (distance_xyz.distance_coords(*it_neigh, i, y_index) <= eps_value) ++count_yz; + } + break; + } + } else if (num_neighbors > static_cast(m_leafsize)) + // process super-leaf values as one + break; + } + + // use triangular inequality to prune branches + CType left_min_distance = d_z - node->threshold; + + // epsilon enforces inequality for discrete distances + if (left_min_distance == 0 && d_z == 1) left_min_distance += std::numeric_limits::epsilon(); + + if (node->left && left_min_distance <= eps_value) { + query_nodes.push(QueryNode{node->left.get(), left_min_distance}); + } + + CType right_min_distance = node->threshold - d_z; + + if (node->right && right_min_distance <= eps_value) { + query_nodes.push(QueryNode{node->right.get(), right_min_distance}); + } + } + + return std::make_tuple(count_xz, count_yz, count_z); +} + +template +int VPTree::count_ball_unconditional_instance(size_t i, + const typename ArrowType::c_type eps_value, + const HybridChebyshevDistance& distance) const { + using CType = typename ArrowType::c_type; + + CType min_distance = 0, distance_neigh = 0; + + int count_n = 0; + + // iterative approach that avoid recursion overhead + QueryQueue query_nodes; + + // start at the root node + query_nodes.push(QueryNode{m_root.get(), min_distance}); + + while (!query_nodes.empty()) { + auto& query = query_nodes.top(); + auto node = query.node; + + query_nodes.pop(); + + std::vector eval_neighbors(1, node->index); + + if (node->is_leaf) { + eval_neighbors = node->leaf_indices; + } + + auto num_neighbors = eval_neighbors.size(); + + for (auto it_neigh = eval_neighbors.begin(), neigh_end = eval_neighbors.end(); it_neigh != neigh_end; + ++it_neigh) { + distance_neigh = distance.distance(*it_neigh, i); + + if (distance_neigh <= eps_value) { + if (num_neighbors <= static_cast(m_leafsize)) { + ++count_n; + } else { + // process super-leaf values as one + count_n += num_neighbors; + break; + } + } else if (num_neighbors > static_cast(m_leafsize)) + // process 
super-leaf values as one + break; + } + + // use triangular inequality to prune branches + CType left_min_distance = distance_neigh - node->threshold; + + // epsilon enforces inequality for discrete distances + if (left_min_distance == 0 && distance_neigh == 1) left_min_distance += std::numeric_limits::epsilon(); + + if (node->left && left_min_distance <= eps_value) { + query_nodes.push(QueryNode{node->left.get(), left_min_distance}); + } + + CType right_min_distance = node->threshold - distance_neigh; + + if (node->right && right_min_distance <= eps_value) { + query_nodes.push(QueryNode{node->right.get(), right_min_distance}); + } + } + + return count_n; +} + +} // namespace vptree diff --git a/pybnesian/vptree/vptree.hpp b/pybnesian/vptree/vptree.hpp new file mode 100644 index 00000000..5677567d --- /dev/null +++ b/pybnesian/vptree/vptree.hpp @@ -0,0 +1,148 @@ +#ifndef PYBNESIAN_VPTREE_HPP +#define PYBNESIAN_VPTREE_HPP + +#include +#include +#include +#include +#include + +using dataset::DataFrame; +using Eigen::Matrix, Eigen::VectorXd, Eigen::VectorXi; + +namespace vptree { + +template +std::vector hash_columns( + const std::vector::ArrayType>>& data, + std::vector column_names); + +template +class HybridChebyshevDistance { +public: + using CType = typename ArrowType::c_type; + using ArrayType = typename arrow::TypeTraits::ArrayType; + using OperationFunc = std::function; + + HybridChebyshevDistance(const std::vector>& data, + const std::vector& is_discrete_column) + : m_data(data) { + m_operations_coords.reserve(m_data.size()); + for (size_t i = 0; i < m_data.size(); ++i) { + if (is_discrete_column[i]) { + // For discrete columns, Hamming distance + m_operations_coords.push_back([this, i](size_t p1_index, size_t p2_index) -> CType { + return (m_data[i]->Value(p1_index) != m_data[i]->Value(p2_index)); + }); + } else { + // For continuous columns, Manhattan distance + m_operations_coords.push_back([this, i](size_t p1_index, size_t p2_index) -> CType { + return std::abs(m_data[i]->Value(p1_index) - m_data[i]->Value(p2_index)); + }); + } + } + } + + inline CType distance(size_t p1_index, size_t p2_index) const { + CType d = 0; + for (auto it_operation = m_operations_coords.begin(), it_end = m_operations_coords.end(); + it_operation != it_end; + ++it_operation) { + d = std::max(d, (*it_operation)(p1_index, p2_index)); + } + + return d; + } + + inline CType distance_coords(size_t p1_index, size_t p2_index, std::vector& coords) const { + CType d = 0; + for (auto it_col_idx = coords.begin(); it_col_idx != coords.end(); it_col_idx++) { + d = std::max(d, m_operations_coords[*it_col_idx](p1_index, p2_index)); + } + + return d; + } + +private: + const std::vector>& m_data; + std::vector m_operations_coords; +}; + +struct VPTreeNode { + size_t index; + double threshold; + std::unique_ptr left; + std::unique_ptr right; + std::vector leaf_indices; + bool is_leaf; +}; + +class VPTree { +public: + VPTree(DataFrame& df, + std::shared_ptr datatype, + std::vector& is_discrete_column, + int leafsize = 16, + unsigned int seed = std::random_device{}()) + : m_df(df), + m_datatype(datatype), + m_is_discrete_column(is_discrete_column), + m_column_names(df.column_names()), + m_root(), + m_leafsize(leafsize), + m_seed(seed), + m_query_cache(), + m_count_cache(), + m_count_cache_unconditional() { + m_root = build_vptree(m_df, m_datatype, m_is_discrete_column, m_leafsize, m_seed); + } + + std::vector> query(const DataFrame& test_df, int k) const; + + std::tuple count_ball_subspaces(const DataFrame& test_df, + const 
VectorXd& eps, + std::vector& is_discrete_column) const; + + VectorXi count_ball_unconditional(const DataFrame& test_df, + const VectorXd& eps, + std::vector& is_discrete_column) const; + + const DataFrame& scaled_data() const { return m_df; } + +private: + std::unique_ptr build_vptree(const DataFrame& df, + const std::shared_ptr datatype, + const std::vector& is_discrete_column, + int leafsize, + unsigned int seed); + + template + std::pair query_instance(size_t i, + int k, + const HybridChebyshevDistance& distance) const; + + template + std::tuple count_ball_subspaces_instance(size_t i, + const typename ArrowType::c_type eps_value, + const HybridChebyshevDistance& distance) const; + + template + int count_ball_unconditional_instance(size_t i, + const typename ArrowType::c_type eps_value, + const HybridChebyshevDistance& distance) const; + + DataFrame& m_df; + std::shared_ptr m_datatype; + std::vector& m_is_discrete_column; + std::vector m_column_names; + std::unique_ptr m_root; + int m_leafsize; + unsigned int m_seed; + mutable std::unordered_map> m_query_cache; + mutable std::unordered_map> m_count_cache; + mutable std::unordered_map m_count_cache_unconditional; +}; + +} // namespace vptree + +#endif // PYBNESIAN_VPTREE_HPP \ No newline at end of file From fc5ca94fe88ac6f099ce04d78282e0d2b00cc02f Mon Sep 17 00:00:00 2001 From: JuanFPR-UPM Date: Fri, 7 Mar 2025 14:32:21 +0100 Subject: [PATCH 48/75] Discrete Schurmann-Grassberger smoothing --- pybnesian/learning/parameters/mle_DiscreteFactor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pybnesian/learning/parameters/mle_DiscreteFactor.cpp b/pybnesian/learning/parameters/mle_DiscreteFactor.cpp index c6f13776..f9680e5e 100644 --- a/pybnesian/learning/parameters/mle_DiscreteFactor.cpp +++ b/pybnesian/learning/parameters/mle_DiscreteFactor.cpp @@ -30,8 +30,8 @@ typename DiscreteFactor::ParamsClass _fit(const DataFrame& df, logprob(offset + i) = loguniform; } } else { - // Laplace Smoothing, lambda = 1 (uniform prior) - int lambda = 1; + // Schurmann-Grassberger smoothing, lambda = 1 (uniform prior) + double lambda = 1/cardinality(0); double logsum_configuration = std::log(static_cast(sum_configuration + lambda * cardinality(0))); for (auto i = 0; i < cardinality(0); ++i) { logprob(offset + i) = std::log(static_cast(joint_counts(offset + i) + lambda)) - logsum_configuration; From 019cdbd4660b1950d35bc79f5e0ce1e5e697730e Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Fri, 7 Mar 2025 13:51:44 +0000 Subject: [PATCH 49/75] Refactor PR with ms-vscode.cpptools --- .../independences/hybrid/mixed_knncmi.cpp | 26 ++++++++++--------- .../independences/hybrid/mixed_knncmi.hpp | 18 ++++++------- .../parameters/mle_DiscreteFactor.cpp | 5 ++-- .../pybindings_independences.cpp | 2 +- 4 files changed, 27 insertions(+), 24 deletions(-) diff --git a/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp b/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp index f3a07fae..437575d9 100644 --- a/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp +++ b/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp @@ -206,7 +206,7 @@ int MixedKMutualInformation::find_minimum_cluster_size(const std::vector& discrete_vars) const { + const std::vector& discrete_vars) const { // hash the columns as they are no longer of type arrow::DictionaryArray std::unordered_map joint_counts; switch (m_datatype->id()) { @@ -331,7 +331,9 @@ double MixedKMutualInformation::mi(const std::string& x, const std::string& y, c return mi_general(ztree, 
subset_df, k, m_datatype, is_discrete_column, m_tree_leafsize, m_seed); } -double MixedKMutualInformation::mi(const std::string& x, const std::string& y, const std::vector& z) const { +double MixedKMutualInformation::mi(const std::string& x, + const std::string& y, + const std::vector& z) const { auto subset_df = m_scaled_df.loc(x, y, z); std::vector is_discrete_column; bool discrete_present = false; @@ -511,8 +513,8 @@ double MixedKMutualInformation::pvalue(const std::string& x, const std::string& } double MixedKMutualInformation::pvalue(const std::string& x, - const std::string& y, - const std::vector& z) const { + const std::string& y, + const std::vector& z) const { auto subset_df = m_scaled_df.loc(x, y, z); std::vector is_discrete_column; bool discrete_present = false; @@ -579,14 +581,14 @@ void shuffle_dataframe(const CType* original_x, } double MixedKMutualInformation::shuffled_pvalue(double original_mi, - int k, - int shuffle_neighbors, - DataFrame& x_df, - VPTree& ztree, - DataFrame& z_df, - DataFrame& shuffled_df, - std::vector& is_discrete_column, - std::vector& discrete_vars) const { + int k, + int shuffle_neighbors, + DataFrame& x_df, + VPTree& ztree, + DataFrame& z_df, + DataFrame& shuffled_df, + std::vector& is_discrete_column, + std::vector& discrete_vars) const { std::minstd_rand rng{m_seed}; std::vector neighbors(m_df->num_rows()); diff --git a/pybnesian/learning/independences/hybrid/mixed_knncmi.hpp b/pybnesian/learning/independences/hybrid/mixed_knncmi.hpp index 10346ea1..c15baf02 100644 --- a/pybnesian/learning/independences/hybrid/mixed_knncmi.hpp +++ b/pybnesian/learning/independences/hybrid/mixed_knncmi.hpp @@ -32,14 +32,14 @@ double mi_pair(VPTree& ytree, class MixedKMutualInformation : public IndependenceTest { public: MixedKMutualInformation(DataFrame df, - int k, - unsigned int seed = std::random_device{}(), - int shuffle_neighbors = 5, - int samples = 1000, - std::string scaling = "min_max", - bool gamma_approx = true, - bool adaptive_k = true, - int tree_leafsize = 16) + int k, + unsigned int seed = std::random_device{}(), + int shuffle_neighbors = 5, + int samples = 1000, + std::string scaling = "min_max", + bool gamma_approx = true, + bool adaptive_k = true, + int tree_leafsize = 16) : m_df(df), m_scaled_df(scale_data(df, scaling)), m_datatype(), @@ -84,7 +84,7 @@ class MixedKMutualInformation : public IndependenceTest { int find_minimum_cluster_size(const std::vector& discrete_vars) const; int find_minimum_shuffled_cluster_size(const DataFrame& shuffled_df, - const std::vector& discrete_vars) const; + const std::vector& discrete_vars) const; DataFrame m_df; DataFrame m_scaled_df; std::shared_ptr m_datatype; diff --git a/pybnesian/learning/parameters/mle_DiscreteFactor.cpp b/pybnesian/learning/parameters/mle_DiscreteFactor.cpp index f9680e5e..f7fa81bd 100644 --- a/pybnesian/learning/parameters/mle_DiscreteFactor.cpp +++ b/pybnesian/learning/parameters/mle_DiscreteFactor.cpp @@ -31,10 +31,11 @@ typename DiscreteFactor::ParamsClass _fit(const DataFrame& df, } } else { // Schurmann-Grassberger smoothing, lambda = 1 (uniform prior) - double lambda = 1/cardinality(0); + double lambda = 1 / cardinality(0); double logsum_configuration = std::log(static_cast(sum_configuration + lambda * cardinality(0))); for (auto i = 0; i < cardinality(0); ++i) { - logprob(offset + i) = std::log(static_cast(joint_counts(offset + i) + lambda)) - logsum_configuration; + logprob(offset + i) = + std::log(static_cast(joint_counts(offset + i) + lambda)) - logsum_configuration; } } } diff 
--git a/pybnesian/pybindings/pybindings_learning/pybindings_independences.cpp b/pybnesian/pybindings/pybindings_learning/pybindings_independences.cpp index d9421676..68450aaf 100644 --- a/pybnesian/pybindings/pybindings_learning/pybindings_independences.cpp +++ b/pybnesian/pybindings/pybindings_learning/pybindings_independences.cpp @@ -528,7 +528,7 @@ Initializes a :class:`DynamicChiSquare` with the given :class:`DynamicDataFrame` :param ddf: :class:`DynamicDataFrame` to create the :class:`DynamicChiSquare`. )doc"); -py::class_>( + py::class_>( root, "MixedKMutualInformation", R"doc( This class implements a non-parametric independence test that is based on the estimation of the mutual information using k-nearest neighbors, accelerated using vantage-point trees (VP-Trees). This independence is implemented for a mix of categorical and continuous data. From a0d94d27b8d7f7707eebd12cd8c906cd77f35b83 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Fri, 7 Mar 2025 14:00:40 +0000 Subject: [PATCH 50/75] Revert "Refactor PR with ms-vscode.cpptools" This reverts commit 019cdbd4660b1950d35bc79f5e0ce1e5e697730e. --- .../independences/hybrid/mixed_knncmi.cpp | 26 +++++++++---------- .../independences/hybrid/mixed_knncmi.hpp | 18 ++++++------- .../parameters/mle_DiscreteFactor.cpp | 5 ++-- .../pybindings_independences.cpp | 2 +- 4 files changed, 24 insertions(+), 27 deletions(-) diff --git a/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp b/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp index 437575d9..f3a07fae 100644 --- a/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp +++ b/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp @@ -206,7 +206,7 @@ int MixedKMutualInformation::find_minimum_cluster_size(const std::vector& discrete_vars) const { + const std::vector& discrete_vars) const { // hash the columns as they are no longer of type arrow::DictionaryArray std::unordered_map joint_counts; switch (m_datatype->id()) { @@ -331,9 +331,7 @@ double MixedKMutualInformation::mi(const std::string& x, const std::string& y, c return mi_general(ztree, subset_df, k, m_datatype, is_discrete_column, m_tree_leafsize, m_seed); } -double MixedKMutualInformation::mi(const std::string& x, - const std::string& y, - const std::vector& z) const { +double MixedKMutualInformation::mi(const std::string& x, const std::string& y, const std::vector& z) const { auto subset_df = m_scaled_df.loc(x, y, z); std::vector is_discrete_column; bool discrete_present = false; @@ -513,8 +511,8 @@ double MixedKMutualInformation::pvalue(const std::string& x, const std::string& } double MixedKMutualInformation::pvalue(const std::string& x, - const std::string& y, - const std::vector& z) const { + const std::string& y, + const std::vector& z) const { auto subset_df = m_scaled_df.loc(x, y, z); std::vector is_discrete_column; bool discrete_present = false; @@ -581,14 +579,14 @@ void shuffle_dataframe(const CType* original_x, } double MixedKMutualInformation::shuffled_pvalue(double original_mi, - int k, - int shuffle_neighbors, - DataFrame& x_df, - VPTree& ztree, - DataFrame& z_df, - DataFrame& shuffled_df, - std::vector& is_discrete_column, - std::vector& discrete_vars) const { + int k, + int shuffle_neighbors, + DataFrame& x_df, + VPTree& ztree, + DataFrame& z_df, + DataFrame& shuffled_df, + std::vector& is_discrete_column, + std::vector& discrete_vars) const { std::minstd_rand rng{m_seed}; std::vector neighbors(m_df->num_rows()); diff --git a/pybnesian/learning/independences/hybrid/mixed_knncmi.hpp 
b/pybnesian/learning/independences/hybrid/mixed_knncmi.hpp index c15baf02..10346ea1 100644 --- a/pybnesian/learning/independences/hybrid/mixed_knncmi.hpp +++ b/pybnesian/learning/independences/hybrid/mixed_knncmi.hpp @@ -32,14 +32,14 @@ double mi_pair(VPTree& ytree, class MixedKMutualInformation : public IndependenceTest { public: MixedKMutualInformation(DataFrame df, - int k, - unsigned int seed = std::random_device{}(), - int shuffle_neighbors = 5, - int samples = 1000, - std::string scaling = "min_max", - bool gamma_approx = true, - bool adaptive_k = true, - int tree_leafsize = 16) + int k, + unsigned int seed = std::random_device{}(), + int shuffle_neighbors = 5, + int samples = 1000, + std::string scaling = "min_max", + bool gamma_approx = true, + bool adaptive_k = true, + int tree_leafsize = 16) : m_df(df), m_scaled_df(scale_data(df, scaling)), m_datatype(), @@ -84,7 +84,7 @@ class MixedKMutualInformation : public IndependenceTest { int find_minimum_cluster_size(const std::vector& discrete_vars) const; int find_minimum_shuffled_cluster_size(const DataFrame& shuffled_df, - const std::vector& discrete_vars) const; + const std::vector& discrete_vars) const; DataFrame m_df; DataFrame m_scaled_df; std::shared_ptr m_datatype; diff --git a/pybnesian/learning/parameters/mle_DiscreteFactor.cpp b/pybnesian/learning/parameters/mle_DiscreteFactor.cpp index f7fa81bd..f9680e5e 100644 --- a/pybnesian/learning/parameters/mle_DiscreteFactor.cpp +++ b/pybnesian/learning/parameters/mle_DiscreteFactor.cpp @@ -31,11 +31,10 @@ typename DiscreteFactor::ParamsClass _fit(const DataFrame& df, } } else { // Schurmann-Grassberger smoothing, lambda = 1 (uniform prior) - double lambda = 1 / cardinality(0); + double lambda = 1/cardinality(0); double logsum_configuration = std::log(static_cast(sum_configuration + lambda * cardinality(0))); for (auto i = 0; i < cardinality(0); ++i) { - logprob(offset + i) = - std::log(static_cast(joint_counts(offset + i) + lambda)) - logsum_configuration; + logprob(offset + i) = std::log(static_cast(joint_counts(offset + i) + lambda)) - logsum_configuration; } } } diff --git a/pybnesian/pybindings/pybindings_learning/pybindings_independences.cpp b/pybnesian/pybindings/pybindings_learning/pybindings_independences.cpp index 68450aaf..d9421676 100644 --- a/pybnesian/pybindings/pybindings_learning/pybindings_independences.cpp +++ b/pybnesian/pybindings/pybindings_learning/pybindings_independences.cpp @@ -528,7 +528,7 @@ Initializes a :class:`DynamicChiSquare` with the given :class:`DynamicDataFrame` :param ddf: :class:`DynamicDataFrame` to create the :class:`DynamicChiSquare`. )doc"); - py::class_>( +py::class_>( root, "MixedKMutualInformation", R"doc( This class implements a non-parametric independence test that is based on the estimation of the mutual information using k-nearest neighbors, accelerated using vantage-point trees (VP-Trees). This independence is implemented for a mix of categorical and continuous data. From 4ba60b6d6df6772d8d11279e652e56c36fae242b Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Fri, 7 Mar 2025 14:01:04 +0000 Subject: [PATCH 51/75] Revert "Revert "Refactor PR with ms-vscode.cpptools"" This reverts commit a0d94d27b8d7f7707eebd12cd8c906cd77f35b83. 
--- .../independences/hybrid/mixed_knncmi.cpp | 26 ++++++++++--------- .../independences/hybrid/mixed_knncmi.hpp | 18 ++++++------- .../parameters/mle_DiscreteFactor.cpp | 5 ++-- .../pybindings_independences.cpp | 2 +- 4 files changed, 27 insertions(+), 24 deletions(-) diff --git a/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp b/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp index f3a07fae..437575d9 100644 --- a/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp +++ b/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp @@ -206,7 +206,7 @@ int MixedKMutualInformation::find_minimum_cluster_size(const std::vector& discrete_vars) const { + const std::vector& discrete_vars) const { // hash the columns as they are no longer of type arrow::DictionaryArray std::unordered_map joint_counts; switch (m_datatype->id()) { @@ -331,7 +331,9 @@ double MixedKMutualInformation::mi(const std::string& x, const std::string& y, c return mi_general(ztree, subset_df, k, m_datatype, is_discrete_column, m_tree_leafsize, m_seed); } -double MixedKMutualInformation::mi(const std::string& x, const std::string& y, const std::vector& z) const { +double MixedKMutualInformation::mi(const std::string& x, + const std::string& y, + const std::vector& z) const { auto subset_df = m_scaled_df.loc(x, y, z); std::vector is_discrete_column; bool discrete_present = false; @@ -511,8 +513,8 @@ double MixedKMutualInformation::pvalue(const std::string& x, const std::string& } double MixedKMutualInformation::pvalue(const std::string& x, - const std::string& y, - const std::vector& z) const { + const std::string& y, + const std::vector& z) const { auto subset_df = m_scaled_df.loc(x, y, z); std::vector is_discrete_column; bool discrete_present = false; @@ -579,14 +581,14 @@ void shuffle_dataframe(const CType* original_x, } double MixedKMutualInformation::shuffled_pvalue(double original_mi, - int k, - int shuffle_neighbors, - DataFrame& x_df, - VPTree& ztree, - DataFrame& z_df, - DataFrame& shuffled_df, - std::vector& is_discrete_column, - std::vector& discrete_vars) const { + int k, + int shuffle_neighbors, + DataFrame& x_df, + VPTree& ztree, + DataFrame& z_df, + DataFrame& shuffled_df, + std::vector& is_discrete_column, + std::vector& discrete_vars) const { std::minstd_rand rng{m_seed}; std::vector neighbors(m_df->num_rows()); diff --git a/pybnesian/learning/independences/hybrid/mixed_knncmi.hpp b/pybnesian/learning/independences/hybrid/mixed_knncmi.hpp index 10346ea1..c15baf02 100644 --- a/pybnesian/learning/independences/hybrid/mixed_knncmi.hpp +++ b/pybnesian/learning/independences/hybrid/mixed_knncmi.hpp @@ -32,14 +32,14 @@ double mi_pair(VPTree& ytree, class MixedKMutualInformation : public IndependenceTest { public: MixedKMutualInformation(DataFrame df, - int k, - unsigned int seed = std::random_device{}(), - int shuffle_neighbors = 5, - int samples = 1000, - std::string scaling = "min_max", - bool gamma_approx = true, - bool adaptive_k = true, - int tree_leafsize = 16) + int k, + unsigned int seed = std::random_device{}(), + int shuffle_neighbors = 5, + int samples = 1000, + std::string scaling = "min_max", + bool gamma_approx = true, + bool adaptive_k = true, + int tree_leafsize = 16) : m_df(df), m_scaled_df(scale_data(df, scaling)), m_datatype(), @@ -84,7 +84,7 @@ class MixedKMutualInformation : public IndependenceTest { int find_minimum_cluster_size(const std::vector& discrete_vars) const; int find_minimum_shuffled_cluster_size(const DataFrame& shuffled_df, - const std::vector& discrete_vars) 
const; + const std::vector& discrete_vars) const; DataFrame m_df; DataFrame m_scaled_df; std::shared_ptr m_datatype; diff --git a/pybnesian/learning/parameters/mle_DiscreteFactor.cpp b/pybnesian/learning/parameters/mle_DiscreteFactor.cpp index f9680e5e..f7fa81bd 100644 --- a/pybnesian/learning/parameters/mle_DiscreteFactor.cpp +++ b/pybnesian/learning/parameters/mle_DiscreteFactor.cpp @@ -31,10 +31,11 @@ typename DiscreteFactor::ParamsClass _fit(const DataFrame& df, } } else { // Schurmann-Grassberger smoothing, lambda = 1 (uniform prior) - double lambda = 1/cardinality(0); + double lambda = 1 / cardinality(0); double logsum_configuration = std::log(static_cast(sum_configuration + lambda * cardinality(0))); for (auto i = 0; i < cardinality(0); ++i) { - logprob(offset + i) = std::log(static_cast(joint_counts(offset + i) + lambda)) - logsum_configuration; + logprob(offset + i) = + std::log(static_cast(joint_counts(offset + i) + lambda)) - logsum_configuration; } } } diff --git a/pybnesian/pybindings/pybindings_learning/pybindings_independences.cpp b/pybnesian/pybindings/pybindings_learning/pybindings_independences.cpp index d9421676..68450aaf 100644 --- a/pybnesian/pybindings/pybindings_learning/pybindings_independences.cpp +++ b/pybnesian/pybindings/pybindings_learning/pybindings_independences.cpp @@ -528,7 +528,7 @@ Initializes a :class:`DynamicChiSquare` with the given :class:`DynamicDataFrame` :param ddf: :class:`DynamicDataFrame` to create the :class:`DynamicChiSquare`. )doc"); -py::class_>( + py::class_>( root, "MixedKMutualInformation", R"doc( This class implements a non-parametric independence test that is based on the estimation of the mutual information using k-nearest neighbors, accelerated using vantage-point trees (VP-Trees). This independence is implemented for a mix of categorical and continuous data. 
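The hybrid metric that the VP-tree patches above rely on is a Chebyshev (max) combination of per-coordinate distances: Hamming (0/1) on discrete columns and absolute difference on continuous ones. Below is a minimal self-contained sketch of that metric, assuming plain `double` columns with illustrative names rather than the library's Arrow-backed arrays:

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Hybrid Chebyshev distance between rows p1 and p2 of a column-major dataset:
// Hamming (0/1) on discrete columns, absolute difference on continuous ones,
// combined with a maximum. Names and layout are illustrative, not the
// library's actual API.
double hybrid_chebyshev(const std::vector<std::vector<double>>& columns,
                        const std::vector<bool>& is_discrete,
                        std::size_t p1,
                        std::size_t p2) {
    double d = 0.0;
    for (std::size_t j = 0; j < columns.size(); ++j) {
        const double dj = is_discrete[j]
                              ? static_cast<double>(columns[j][p1] != columns[j][p2])
                              : std::abs(columns[j][p1] - columns[j][p2]);
        d = std::max(d, dj);
    }
    return d;
}
```

Because discrete coordinates contribute only 0 or 1 and the continuous columns are min-max scaled beforehand, every per-coordinate distance lies in [0, 1]. That is also why the tree traversal above nudges the pruning bound by a `std::numeric_limits` epsilon when a distance equals exactly 1: it keeps the triangle-inequality pruning strict in the presence of discrete mismatches.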
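The `mle_DiscreteFactor` change in the patches above swaps Laplace smoothing (lambda = 1) for Schurmann-Grassberger smoothing, a symmetric Dirichlet prior with lambda = 1/K, where K is the variable's cardinality, so each estimate becomes p_i = (n_i + lambda) / (N + lambda * K). A minimal sketch under those definitions follows. One caution, stated as an assumption about the types involved: if `cardinality(0)` is an integral expression, `1/cardinality(0)` is integer division and truncates to 0 for K > 1, so the quotient should be formed in floating point:

```cpp
#include <cmath>
#include <vector>

// Schurmann-Grassberger smoothing for one parent configuration of a discrete
// variable with K states: a Dirichlet prior that adds lambda = 1/K to every
// count. Names are illustrative.
std::vector<double> sg_log_probabilities(const std::vector<int>& counts) {
    const int K = static_cast<int>(counts.size());
    int total = 0;
    for (int c : counts) total += c;

    // 1.0 / K, not 1 / K: with integer operands the division truncates to 0
    // for K > 1, which would silently disable the smoothing.
    const double lambda = 1.0 / K;

    // With lambda = 1/K, lambda * K == 1, so the denominator is total + 1.
    const double log_denominator = std::log(total + lambda * K);

    std::vector<double> logprob(K);
    for (int i = 0; i < K; ++i) {
        logprob[i] = std::log(counts[i] + lambda) - log_denominator;
    }
    return logprob;
}
```

With lambda = 1/K the pseudo-counts add exactly one virtual observation in total, so the denominator reduces to N + 1 for any cardinality, which is a quick sanity check for an implementation.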
From af69f43bba276c7f7fe08a7b82e9749ce240976a Mon Sep 17 00:00:00 2001 From: JuanFPR-UPM Date: Fri, 7 Mar 2025 16:26:08 +0100 Subject: [PATCH 52/75] Added new files to cmake --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5b1e63b9..886f54d6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -159,6 +159,7 @@ pybind11_add_module(__init__ "pybnesian/lib.cpp" "pybnesian/util/pickle.cpp" "pybnesian/util/util_types.cpp" "pybnesian/kdtree/kdtree.cpp" + "pybnesian/vptree/vptree.cpp" "pybnesian/learning/operators/operators.cpp" "pybnesian/learning/algorithms/hillclimbing.cpp" "pybnesian/learning/algorithms/pc.cpp" @@ -170,6 +171,7 @@ pybind11_add_module(__init__ "pybnesian/lib.cpp" "pybnesian/learning/independences/continuous/RCoT.cpp" "pybnesian/learning/independences/discrete/chi_square.cpp" "pybnesian/learning/independences/hybrid/mutual_information.cpp" + "pybnesian/learning/independences/hybrid/mixed_knncmi.cpp" "pybnesian/learning/parameters/mle_LinearGaussianCPD.cpp" "pybnesian/learning/parameters/mle_DiscreteFactor.cpp" "pybnesian/learning/scores/bic.cpp" From 5d2d8ab238cbcc731d9820c675be3ac0301d6e1f Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Sat, 8 Mar 2025 11:38:32 +0000 Subject: [PATCH 53/75] CMake Intellisense formatting --- CMakeLists.txt | 224 ++++++++++++++++++++++++------------------------- 1 file changed, 112 insertions(+), 112 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 886f54d6..85552886 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,72 +1,72 @@ cmake_minimum_required(VERSION 3.20.0) -IF(WIN32) +if(WIN32) set(VCPKG_TARGET_TRIPLET x64-windows-static) -ENDIF() +endif() -IF(APPLE) - SET(CMAKE_C_COMPILER "clang") - SET(CMAKE_CXX_COMPILER "clang++") -ENDIF() +if(APPLE) + set(CMAKE_C_COMPILER "clang") + set(CMAKE_CXX_COMPILER "clang++") +endif() -IF(UNIX) - SET(CMAKE_C_COMPILER "gcc") - SET(CMAKE_CXX_COMPILER "g++") -ENDIF() +if(UNIX) + set(CMAKE_C_COMPILER "gcc") + set(CMAKE_CXX_COMPILER "g++") +endif() find_package(Git REQUIRED) message("Git executable: ${GIT_EXECUTABLE}") -IF(EXISTS ".git") - SET(GIT_COMMAND_EXECUTED "{GIT_EXECUTABLE} submodule update --init --recursive") +if(EXISTS ".git") + set(GIT_COMMAND_EXECUTED "{GIT_EXECUTABLE} submodule update --init --recursive") execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init --recursive - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} RESULT_VARIABLE GIT_SUBMOD_RESULT) + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} RESULT_VARIABLE GIT_SUBMOD_RESULT) - IF(NOT GIT_SUBMOD_RESULT EQUAL "0") + if(NOT GIT_SUBMOD_RESULT EQUAL "0") message(FATAL_ERROR "${GIT_COMMAND_EXECUTED} failed with ${GIT_SUBMOD_RESULT}.") - ENDIF() -ELSE() - SET(GIT_COMMAND_EXECUTED "${GIT_EXECUTABLE} clone https://github.com/Microsoft/vcpkg.git") + endif() +else() + set(GIT_COMMAND_EXECUTED "${GIT_EXECUTABLE} clone https://github.com/Microsoft/vcpkg.git") execute_process(COMMAND ${GIT_EXECUTABLE} clone https://github.com/Microsoft/vcpkg.git - WORKING_DIRECTORY "." RESULT_VARIABLE GIT_SUBMOD_RESULT) + WORKING_DIRECTORY "." 
RESULT_VARIABLE GIT_SUBMOD_RESULT) - IF(NOT GIT_SUBMOD_RESULT EQUAL "0") + if(NOT GIT_SUBMOD_RESULT EQUAL "0") message(FATAL_ERROR "${GIT_COMMAND_EXECUTED} failed with ${GIT_SUBMOD_RESULT}.") - ENDIF() -ENDIF() + endif() +endif() -SET(GIT_COMMIT_HASH "2024.08.23") +set(GIT_COMMIT_HASH "2024.08.23") -SET(GIT_COMMAND_EXECUTED "${GIT_EXECUTABLE} checkout ${GIT_COMMIT_HASH}") +set(GIT_COMMAND_EXECUTED "${GIT_EXECUTABLE} checkout ${GIT_COMMIT_HASH}") execute_process(COMMAND ${GIT_EXECUTABLE} checkout ${GIT_COMMIT_HASH} - WORKING_DIRECTORY "vcpkg" RESULT_VARIABLE GIT_SUBMOD_RESULT) + WORKING_DIRECTORY "vcpkg" RESULT_VARIABLE GIT_SUBMOD_RESULT) -IF(NOT GIT_SUBMOD_RESULT EQUAL "0") +if(NOT GIT_SUBMOD_RESULT EQUAL "0") message(FATAL_ERROR "${GIT_COMMAND_EXECUTED} failed with ${GIT_SUBMOD_RESULT}.") -ENDIF() +endif() -SET(GIT_COMMAND_EXECUTED "${GIT_EXECUTABLE} rev-parse HEAD") +set(GIT_COMMAND_EXECUTED "${GIT_EXECUTABLE} rev-parse HEAD") execute_process(COMMAND ${GIT_EXECUTABLE} rev-parse HEAD - WORKING_DIRECTORY "vcpkg" RESULT_VARIABLE GIT_SUBMOD_RESULT OUTPUT_VARIABLE GIT_STDOUT) + WORKING_DIRECTORY "vcpkg" RESULT_VARIABLE GIT_SUBMOD_RESULT OUTPUT_VARIABLE GIT_STDOUT) -IF(NOT GIT_SUBMOD_RESULT EQUAL "0") +if(NOT GIT_SUBMOD_RESULT EQUAL "0") message(FATAL_ERROR "${GIT_COMMAND_EXECUTED} failed with ${GIT_SUBMOD_RESULT}.") -ENDIF() +endif() message("Git commit in vcpkg: ${GIT_STDOUT}") set(CMAKE_TOOLCHAIN_FILE "vcpkg/scripts/buildsystems/vcpkg.cmake") project(pybnesian VERSION ${SKBUILD_PROJECT_VERSION} LANGUAGES CXX) -ADD_DEFINITIONS("-DVERSION_INFO=${SKBUILD_PROJECT_VERSION}") +add_definitions("-DVERSION_INFO=${SKBUILD_PROJECT_VERSION}") set(CMAKE_CXX_STANDARD 17) -IF(MSVC) - SET(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") - ADD_DEFINITIONS("-DNOGDI") -ENDIF() +if(MSVC) + set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") + add_definitions("-DNOGDI") +endif() set(PYBIND11_NEWPYTHON ON) find_package(Python COMPONENTS Interpreter Development) @@ -77,13 +77,13 @@ message("Minor version: ${Python_VERSION_MINOR}") add_definitions(-DPYTHON_VERSION_MAJOR=${Python_VERSION_MAJOR} -DPYTHON_VERSION_MINOR=${Python_VERSION_MINOR}) -IF(WIN32) - SET(SCRIPT_PREFIX "") - SET(SCRIPT_EXTENSION "bat") -ELSEIF(UNIX) - SET(SCRIPT_PREFIX "./") - SET(SCRIPT_EXTENSION "sh") -ENDIF() +if(WIN32) + set(SCRIPT_PREFIX "") + set(SCRIPT_EXTENSION "bat") +elseif(UNIX) + set(SCRIPT_PREFIX "./") + set(SCRIPT_EXTENSION "sh") +endif() # Find the Python interpreter find_package(PythonInterp 3 REQUIRED) @@ -91,21 +91,21 @@ find_package(PythonInterp 3 REQUIRED) # Use the found Python interpreter in the execute_process command execute_process(COMMAND ${PYTHON_EXECUTABLE} expand_sources.py RESULT_VARIABLE EXPAND_SOURCES_RESULT) -IF(NOT EXPAND_SOURCES_RESULT EQUAL "0") +if(NOT EXPAND_SOURCES_RESULT EQUAL "0") message(FATAL_ERROR "${PYTHON_EXECUTABLE} expand_sources.py failed with ${EXPAND_SOURCES_RESULT}") -ENDIF() +endif() execute_process(COMMAND ${SCRIPT_PREFIX}bootstrap-vcpkg.${SCRIPT_EXTENSION} WORKING_DIRECTORY "vcpkg" RESULT_VARIABLE VCPKG_BOOTSTRAP_RESULT) -IF(NOT VCPKG_BOOTSTRAP_RESULT EQUAL "0") +if(NOT VCPKG_BOOTSTRAP_RESULT EQUAL "0") message(FATAL_ERROR "${SCRIPT_PREFIX}bootstrap-vcpkg.${SCRIPT_EXTENSION} failed with ${VCPKG_BOOTSTRAP_RESULT}") -ENDIF() +endif() execute_process(COMMAND ${SCRIPT_PREFIX}vcpkg install WORKING_DIRECTORY "vcpkg" RESULT_VARIABLE VCPKG_INSTALL_RESULT) -IF(NOT VCPKG_INSTALL_RESULT EQUAL "0") +if(NOT VCPKG_INSTALL_RESULT EQUAL "0") message(FATAL_ERROR "${SCRIPT_PREFIX}vcpkg install 
failed with ${VCPKG_INSTALL_RESULT}") -ENDIF() +endif() find_package(Arrow CONFIG REQUIRED) message("Arrow found: ${Arrow_FOUND}") @@ -126,70 +126,70 @@ find_package(Boost REQUIRED COMPONENTS math dynamic_bitset) find_package(OpenCL REQUIRED) pybind11_add_module(__init__ "pybnesian/lib.cpp" - "pybnesian/pybindings/pybindings_dataset.cpp" - "pybnesian/pybindings/pybindings_kde.cpp" - "pybnesian/pybindings/pybindings_factors.cpp" - "pybnesian/pybindings/pybindings_graph.cpp" - "pybnesian/pybindings/pybindings_models.cpp" - "pybnesian/pybindings/pybindings_learning/pybindings_learning.cpp" - "pybnesian/pybindings/pybindings_learning/pybindings_scores.cpp" - "pybnesian/pybindings/pybindings_learning/pybindings_independences.cpp" - "pybnesian/pybindings/pybindings_learning/pybindings_parameters.cpp" - "pybnesian/pybindings/pybindings_learning/pybindings_mle.cpp" - "pybnesian/pybindings/pybindings_learning/pybindings_operators.cpp" - "pybnesian/pybindings/pybindings_learning/pybindings_algorithms.cpp" - "pybnesian/kde/KDE.cpp" - "pybnesian/kde/ProductKDE.cpp" - "pybnesian/kde/UCV.cpp" - "pybnesian/factors/continuous/LinearGaussianCPD.cpp" - "pybnesian/factors/continuous/CKDE.cpp" - "pybnesian/factors/discrete/DiscreteFactor.cpp" - "pybnesian/factors/discrete/discrete_indices.cpp" - "pybnesian/dataset/dataset.cpp" - "pybnesian/dataset/dynamic_dataset.cpp" - "pybnesian/dataset/crossvalidation_adaptator.cpp" - "pybnesian/dataset/holdout_adaptator.cpp" - "pybnesian/util/arrow_types.cpp" - "pybnesian/util/bit_util.cpp" - "pybnesian/util/validate_options.cpp" - "pybnesian/util/validate_whitelists.cpp" - "pybnesian/util/temporal.cpp" - "pybnesian/util/rpoly.cpp" - "pybnesian/util/vech_ops.cpp" - "pybnesian/util/pickle.cpp" - "pybnesian/util/util_types.cpp" - "pybnesian/kdtree/kdtree.cpp" - "pybnesian/vptree/vptree.cpp" - "pybnesian/learning/operators/operators.cpp" - "pybnesian/learning/algorithms/hillclimbing.cpp" - "pybnesian/learning/algorithms/pc.cpp" - "pybnesian/learning/algorithms/mmpc.cpp" - "pybnesian/learning/algorithms/mmhc.cpp" - "pybnesian/learning/algorithms/dmmhc.cpp" - "pybnesian/learning/independences/continuous/linearcorrelation.cpp" - "pybnesian/learning/independences/continuous/mutual_information.cpp" - "pybnesian/learning/independences/continuous/RCoT.cpp" - "pybnesian/learning/independences/discrete/chi_square.cpp" - "pybnesian/learning/independences/hybrid/mutual_information.cpp" - "pybnesian/learning/independences/hybrid/mixed_knncmi.cpp" - "pybnesian/learning/parameters/mle_LinearGaussianCPD.cpp" - "pybnesian/learning/parameters/mle_DiscreteFactor.cpp" - "pybnesian/learning/scores/bic.cpp" - "pybnesian/learning/scores/bge.cpp" - "pybnesian/learning/scores/bde.cpp" - "pybnesian/learning/scores/cv_likelihood.cpp" - "pybnesian/learning/scores/holdout_likelihood.cpp" - "pybnesian/graph/generic_graph.cpp" - "pybnesian/models/BayesianNetwork.cpp" - "pybnesian/models/GaussianNetwork.cpp" - "pybnesian/models/SemiparametricBN.cpp" - "pybnesian/models/KDENetwork.cpp" - "pybnesian/models/DiscreteBN.cpp" - "pybnesian/models/HomogeneousBN.cpp" - "pybnesian/models/HeterogeneousBN.cpp" - "pybnesian/models/CLGNetwork.cpp" - "pybnesian/models/DynamicBayesianNetwork.cpp" - "pybnesian/opencl/opencl_config.cpp") + "pybnesian/pybindings/pybindings_dataset.cpp" + "pybnesian/pybindings/pybindings_kde.cpp" + "pybnesian/pybindings/pybindings_factors.cpp" + "pybnesian/pybindings/pybindings_graph.cpp" + "pybnesian/pybindings/pybindings_models.cpp" + 
"pybnesian/pybindings/pybindings_learning/pybindings_learning.cpp" + "pybnesian/pybindings/pybindings_learning/pybindings_scores.cpp" + "pybnesian/pybindings/pybindings_learning/pybindings_independences.cpp" + "pybnesian/pybindings/pybindings_learning/pybindings_parameters.cpp" + "pybnesian/pybindings/pybindings_learning/pybindings_mle.cpp" + "pybnesian/pybindings/pybindings_learning/pybindings_operators.cpp" + "pybnesian/pybindings/pybindings_learning/pybindings_algorithms.cpp" + "pybnesian/kde/KDE.cpp" + "pybnesian/kde/ProductKDE.cpp" + "pybnesian/kde/UCV.cpp" + "pybnesian/factors/continuous/LinearGaussianCPD.cpp" + "pybnesian/factors/continuous/CKDE.cpp" + "pybnesian/factors/discrete/DiscreteFactor.cpp" + "pybnesian/factors/discrete/discrete_indices.cpp" + "pybnesian/dataset/dataset.cpp" + "pybnesian/dataset/dynamic_dataset.cpp" + "pybnesian/dataset/crossvalidation_adaptator.cpp" + "pybnesian/dataset/holdout_adaptator.cpp" + "pybnesian/util/arrow_types.cpp" + "pybnesian/util/bit_util.cpp" + "pybnesian/util/validate_options.cpp" + "pybnesian/util/validate_whitelists.cpp" + "pybnesian/util/temporal.cpp" + "pybnesian/util/rpoly.cpp" + "pybnesian/util/vech_ops.cpp" + "pybnesian/util/pickle.cpp" + "pybnesian/util/util_types.cpp" + "pybnesian/kdtree/kdtree.cpp" + "pybnesian/vptree/vptree.cpp" + "pybnesian/learning/operators/operators.cpp" + "pybnesian/learning/algorithms/hillclimbing.cpp" + "pybnesian/learning/algorithms/pc.cpp" + "pybnesian/learning/algorithms/mmpc.cpp" + "pybnesian/learning/algorithms/mmhc.cpp" + "pybnesian/learning/algorithms/dmmhc.cpp" + "pybnesian/learning/independences/continuous/linearcorrelation.cpp" + "pybnesian/learning/independences/continuous/mutual_information.cpp" + "pybnesian/learning/independences/continuous/RCoT.cpp" + "pybnesian/learning/independences/discrete/chi_square.cpp" + "pybnesian/learning/independences/hybrid/mutual_information.cpp" + "pybnesian/learning/independences/hybrid/mixed_knncmi.cpp" + "pybnesian/learning/parameters/mle_LinearGaussianCPD.cpp" + "pybnesian/learning/parameters/mle_DiscreteFactor.cpp" + "pybnesian/learning/scores/bic.cpp" + "pybnesian/learning/scores/bge.cpp" + "pybnesian/learning/scores/bde.cpp" + "pybnesian/learning/scores/cv_likelihood.cpp" + "pybnesian/learning/scores/holdout_likelihood.cpp" + "pybnesian/graph/generic_graph.cpp" + "pybnesian/models/BayesianNetwork.cpp" + "pybnesian/models/GaussianNetwork.cpp" + "pybnesian/models/SemiparametricBN.cpp" + "pybnesian/models/KDENetwork.cpp" + "pybnesian/models/DiscreteBN.cpp" + "pybnesian/models/HomogeneousBN.cpp" + "pybnesian/models/HeterogeneousBN.cpp" + "pybnesian/models/CLGNetwork.cpp" + "pybnesian/models/DynamicBayesianNetwork.cpp" + "pybnesian/opencl/opencl_config.cpp") target_include_directories(__init__ PRIVATE "pybnesian") From b9e708ef9a2a93f53dd609faf1253955038e225c Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Sat, 8 Mar 2025 11:42:03 +0000 Subject: [PATCH 54/75] Fix JSON formatting --- vcpkg-configuration.json | 10 +++++----- vcpkg.json | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vcpkg-configuration.json b/vcpkg-configuration.json index 9439b837..b3e14c03 100644 --- a/vcpkg-configuration.json +++ b/vcpkg-configuration.json @@ -1,6 +1,6 @@ { - "$schema": "https://raw.githubusercontent.com/microsoft/vcpkg-tool/main/docs/vcpkg-configuration.schema.json", - "overlay-ports": [ - "./overlay_ports" - ] - } \ No newline at end of file + "$schema": 
"https://raw.githubusercontent.com/microsoft/vcpkg-tool/main/docs/vcpkg-configuration.schema.json", + "overlay-ports": [ + "./overlay_ports" + ] +} \ No newline at end of file diff --git a/vcpkg.json b/vcpkg.json index 202301b1..e25cca37 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -9,4 +9,4 @@ "boost-dynamic-bitset", "opencl" ] -} +} \ No newline at end of file From 05259a59890b07d974162e09ee9fad2f75133e09 Mon Sep 17 00:00:00 2001 From: JuanFPR-UPM Date: Mon, 10 Mar 2025 12:16:04 +0100 Subject: [PATCH 55/75] Numerical fix MixedKCMI --- .../independences/hybrid/mixed_knncmi.cpp | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp b/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp index 437575d9..e3044301 100644 --- a/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp +++ b/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp @@ -197,7 +197,7 @@ int MixedKMutualInformation::find_minimum_cluster_size(const std::vector 0 && joint_counts[i] < min_cluster_size) { + if (joint_counts[i] > 1 && joint_counts[i] < min_cluster_size) { min_cluster_size = joint_counts[i]; } } @@ -230,7 +230,7 @@ int MixedKMutualInformation::find_minimum_shuffled_cluster_size(const DataFrame& // find minimum positive cluster size for (const auto& [config, count] : joint_counts) { - if (count < min_cluster_size) { + if (count > 1 && count < min_cluster_size) { min_cluster_size = count; } } @@ -302,7 +302,7 @@ double MixedKMutualInformation::mi(const std::string& x, const std::string& y) c if (discrete_present && m_adaptive_k) { auto min_cluster_size = find_minimum_cluster_size(discrete_vars); - k = std::min(k, min_cluster_size); + k = std::min(k, min_cluster_size - 1); } auto y_is_discrete_column = std::vector(is_discrete_column.begin() + 1, is_discrete_column.end()); @@ -321,7 +321,7 @@ double MixedKMutualInformation::mi(const std::string& x, const std::string& y, c if (discrete_present && m_adaptive_k) { auto min_cluster_size = find_minimum_cluster_size(discrete_vars); - k = std::min(k, min_cluster_size); + k = std::min(k, min_cluster_size - 1); } auto z_is_discrete_column = std::vector(is_discrete_column.begin() + 2, is_discrete_column.end()); @@ -342,7 +342,7 @@ double MixedKMutualInformation::mi(const std::string& x, if (discrete_present && m_adaptive_k) { auto min_cluster_size = find_minimum_cluster_size(discrete_vars); - k = std::min(k, min_cluster_size); + k = std::min(k, min_cluster_size - 1); } auto z_df = m_scaled_df.loc(z); @@ -430,7 +430,7 @@ double MixedKMutualInformation::pvalue(const std::string& x, const std::string& // the adaptive k affects both the CMI estimates and the shuffling if (discrete_present && m_adaptive_k) { auto min_cluster_size = find_minimum_cluster_size(discrete_vars); - k = std::min(k, min_cluster_size); + k = std::min(k, min_cluster_size - 1); } auto y_is_discrete_column = std::vector(is_discrete_column.begin() + 1, is_discrete_column.end()); @@ -453,7 +453,7 @@ double MixedKMutualInformation::pvalue(const std::string& x, const std::string& // we compute the adaptive k only if X is discrete if (is_discrete_column[0] && m_adaptive_k) { auto min_cluster_size = find_minimum_shuffled_cluster_size(shuffled_df, discrete_vars); - k = std::min(k, min_cluster_size); + k = std::min(k, min_cluster_size - 1); } auto shuffled_value = mi_pair(ytree, shuffled_df, k, m_datatype, is_discrete_column, m_tree_leafsize, m_seed); @@ -471,7 +471,7 @@ double MixedKMutualInformation::pvalue(const 
std::string& x, const std::string& // we compute the adaptive k only if X is discrete if (is_discrete_column[0] && m_adaptive_k) { auto min_cluster_size = find_minimum_shuffled_cluster_size(shuffled_df, discrete_vars); - k = std::min(k, min_cluster_size); + k = std::min(k, min_cluster_size - 1); } auto shuffled_value = mi_pair(ytree, shuffled_df, k, m_datatype, is_discrete_column, m_tree_leafsize, m_seed); @@ -493,8 +493,8 @@ double MixedKMutualInformation::pvalue(const std::string& x, const std::string& if (discrete_present && m_adaptive_k) { auto min_cluster_size = find_minimum_cluster_size(discrete_vars); - k = std::min(k, min_cluster_size); - shuffle_neighbors = std::min(shuffle_neighbors, min_cluster_size); + k = std::min(k, min_cluster_size - 1); + shuffle_neighbors = std::min(shuffle_neighbors, min_cluster_size - 1); } auto x_df = subset_df.loc(0); @@ -525,8 +525,8 @@ double MixedKMutualInformation::pvalue(const std::string& x, // the adaptive k affects both the CMI estimates and the shuffling if (discrete_present && m_adaptive_k) { auto min_cluster_size = find_minimum_cluster_size(discrete_vars); - k = std::min(k, min_cluster_size); - shuffle_neighbors = std::min(shuffle_neighbors, min_cluster_size); + k = std::min(k, min_cluster_size - 1); + shuffle_neighbors = std::min(shuffle_neighbors, min_cluster_size - 1); } auto x_df = subset_df.loc(0); @@ -615,7 +615,7 @@ double MixedKMutualInformation::shuffled_pvalue(double original_mi, // we compute the adaptive k only if X is discrete if (is_discrete_column[0] && m_adaptive_k) { auto min_cluster_size = find_minimum_shuffled_cluster_size(shuffled_df, discrete_vars); - k = std::min(k, min_cluster_size); + k = std::min(k, min_cluster_size - 1); } auto shuffled_value = @@ -638,7 +638,7 @@ double MixedKMutualInformation::shuffled_pvalue(double original_mi, // we compute the adaptive k only if X is discrete if (is_discrete_column[0] && m_adaptive_k) { auto min_cluster_size = find_minimum_shuffled_cluster_size(shuffled_df, discrete_vars); - k = std::min(k, min_cluster_size); + k = std::min(k, min_cluster_size - 1); } auto shuffled_value = From 38cb0916bbc72f420fc192ffd9c38f9f5cd9c6b8 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Tue, 11 Mar 2025 10:18:31 +0000 Subject: [PATCH 56/75] Add detailed installation guide for PyBNesian in INSTALLATION.md --- INSTALLATION.md | 108 ++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 1 + 2 files changed, 109 insertions(+) create mode 100644 INSTALLATION.md diff --git a/INSTALLATION.md b/INSTALLATION.md new file mode 100644 index 00000000..f86cdf54 --- /dev/null +++ b/INSTALLATION.md @@ -0,0 +1,108 @@ +# Installing PyBNesian +Here you can find a detailed installation guide to use PyBNesian including the installation of C++ and GPU tools. + +We acknowledge all the members from Computational Intelligence Group (UPM) for +further discussions related to the installation procedure. + +### Contents +1. [Ubuntu and Linux sub-systems](#ubuntu-and-linux-sub-systems) +2. [Windows](#windows) +3. [Installation issues](#installation-issues) + +## Ubuntu and Linux sub-systems +PyBNesian uses C++ and OpenCL in the backend to speed up certain computations. +Thus, some software is required to ensure everything works. +Note that, although setting up a Conda environment is usually recommended, it is not mandatory. +The following commands ensure that the C++ and OpenCL requirements are satisfied. 
+
+```bash
+sudo apt update
+sudo apt install cmake
+sudo apt install g++
+sudo apt install opencl-headers
+sudo apt install ocl-icd-opencl-dev
+```
+
+After the previous steps you should be able to install PyBNesian and its dependencies.
+
+### Installing from source
+To install from source, first install git so that the repository can be cloned
+from GitHub.
+```bash
+sudo apt install git
+```
+
+Now, clone the repository, install its dependencies, and install the package.
+
+```bash
+git clone https://github.com/carloslihu/PyBNesian.git
+cd PyBNesian
+pip install .
+```
+
+### Installing directly from PyPI
+Before installing PyBNesian, ensure that all the dependencies are already installed in your Python environment.
+
+```bash
+pip install PyBNesian
+```
+
+If no errors were raised, the software is ready to be used. Otherwise, please
+restart the process or raise an issue in the repository.
+
+## Windows
+A common way to avoid toolchain issues on Windows is to install a Linux sub-system
+(https://learn.microsoft.com/es-es/windows/wsl/install).
+If that is your case, please follow the [Ubuntu and Linux sub-systems](#ubuntu-and-linux-sub-systems) section.
+Otherwise, please follow the next steps.
+
+1. Download Visual Studio 2022 from https://visualstudio.microsoft.com/es/vs/
+
+    1.1. Download the requirements for C++
+
+2. Download Visual Studio Build Tools 2022.
+
+```bash
+winget install "Visual Studio Build Tools 2022"
+```
+
+3. Download the developer tools for your GPU.
+
+    3.1. For Nvidia, download the Nvidia Toolkit (https://developer.nvidia.com/cuda-downloads)
+
+    3.2. For Intel, download OneAPI (https://www.intel.com/content/www/us/en/developer/tools/oneapi/toolkits.html)
+
+4. Download OpenCL for Windows. This guide explains the installation process: https://windowsreport.com/opencl-install-windows-11/
+
+5. Install PyBNesian.
+
+### Installing from source
+To install from source, first install Git for Windows (https://git-scm.com/downloads)
+so that the repository can be cloned from GitHub.
+
+Now, clone the repository, install its dependencies, and install the package.
+
+```bash
+git clone https://github.com/carloslihu/PyBNesian.git
+cd PyBNesian
+pip install .
+```
+
+### Installing directly from PyPI
+Before installing PyBNesian, ensure that all the dependencies are already installed in your Python environment.
+
+```bash
+pip install PyBNesian
+```
+
+If no errors were raised, the software is ready to be used.
+Otherwise, please restart the process or raise an issue in the repository.
+
+## Installation issues
+
+1. If the default [Ubuntu and Linux sub-systems](#ubuntu-and-linux-sub-systems) installation
+fails, it might be necessary to install the GPU toolkits for Linux.
+Please visit https://developer.nvidia.com/cuda-downloads for Nvidia, and
+https://www.intel.com/content/www/us/en/developer/tools/oneapi/toolkits.html for Intel.
\ No newline at end of file
diff --git a/README.md b/README.md
index eba05f6e..051c6efc 100644
--- a/README.md
+++ b/README.md
@@ -296,6 +296,7 @@ Prerequisites
 
 - Git.
 - OpenCL drivers installed.
 
+We provide a detailed [installation guide](INSTALLATION.md) for these prerequisites of PyBNesian.
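Whichever route is followed, a quick way to confirm that the installation succeeded is to check that the module imports cleanly from a fresh shell (a minimal check, assuming the package was installed under its usual name):

```bash
python -c "import pybnesian"
```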
Building -------- From 0276ebaf1db18d66ca6a7be22e9f2cd607ee5471 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Wed, 12 Mar 2025 11:16:12 +0000 Subject: [PATCH 57/75] Update pytest.ini to exclude 'vcpkg' directory from test discovery --- pytest.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytest.ini b/pytest.ini index b9c99fa9..9ee7c354 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,4 @@ [pytest] testpaths = tests -norecursedirs = tests/helpers +norecursedirs = tests/helpers vcpkg addopts = -s \ No newline at end of file From f68999a79d91009b6efa6f2da163ff0a131e656c Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Fri, 14 Mar 2025 12:26:10 +0000 Subject: [PATCH 58/75] Refactor test imports to use 'helpers' directory for data generation functions --- tests/conftest.py | 2 +- tests/dataset/crossvalidation_test.py | 3 +-- tests/dataset/holdout_test.py | 3 +-- tests/factors/continuous/CKDE_test.py | 3 +-- tests/factors/continuous/KDE_test.py | 3 +-- tests/factors/continuous/LinearGaussianCPD_test.py | 3 +-- tests/factors/continuous/ProductKDE_test.py | 3 +-- tests/factors/discrete/DiscreteFactor_test.py | 3 +-- tests/learning/algorithms/hillclimbing_test.py | 3 +-- tests/learning/independence_tests/independence_test.py | 2 +- tests/learning/operators/operatorpool_test.py | 3 +-- tests/learning/operators/operatorset_test.py | 3 +-- tests/learning/parameters/mle_test.py | 3 +-- tests/learning/scores/bic_test.py | 3 +-- tests/learning/scores/cvlikelihood_test.py | 3 +-- tests/learning/scores/holdoutlikelihood_test.py | 3 +-- tests/models/BayesianNetwork_test.py | 3 +-- tests/models/BayesianNetwork_type_test.py | 3 +-- tests/models/DynamicBayesianNetwork_test.py | 3 +-- tests/models/SemiparametricBN_test.py | 3 +-- tests/serialization/serialize_models_test.py | 3 +-- 21 files changed, 21 insertions(+), 40 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 556b6c7c..8282ce59 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,4 +2,4 @@ import os import sys -sys.path.append(os.path.join(os.path.dirname(__file__), "helpers")) +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "tests"))) diff --git a/tests/dataset/crossvalidation_test.py b/tests/dataset/crossvalidation_test.py index ee42fd84..7287bf73 100644 --- a/tests/dataset/crossvalidation_test.py +++ b/tests/dataset/crossvalidation_test.py @@ -1,7 +1,6 @@ import numpy as np import pybnesian as pbn - -from data import generate_normal_data +from helpers.data import generate_normal_data SIZE = 10000 diff --git a/tests/dataset/holdout_test.py b/tests/dataset/holdout_test.py index 546fae38..6486af67 100644 --- a/tests/dataset/holdout_test.py +++ b/tests/dataset/holdout_test.py @@ -1,8 +1,7 @@ import numpy as np import pandas as pd import pybnesian as pbn - -from data import generate_normal_data +from helpers.data import generate_normal_data SIZE = 10000 diff --git a/tests/factors/continuous/CKDE_test.py b/tests/factors/continuous/CKDE_test.py index ef9281a8..0e2b273e 100644 --- a/tests/factors/continuous/CKDE_test.py +++ b/tests/factors/continuous/CKDE_test.py @@ -3,12 +3,11 @@ import pyarrow as pa import pybnesian as pbn import pytest +from helpers.data import generate_normal_data from scipy.stats import gaussian_kde from scipy.stats import multivariate_normal as mvn from scipy.stats import norm -from data import generate_normal_data - SIZE = 10000 SMALL_SIZE = 10 TEST_SIZE = 50 diff --git a/tests/factors/continuous/KDE_test.py b/tests/factors/continuous/KDE_test.py 
index c7439543..e2e89de4 100644 --- a/tests/factors/continuous/KDE_test.py +++ b/tests/factors/continuous/KDE_test.py @@ -2,11 +2,10 @@ import pyarrow as pa import pybnesian as pbn import pytest +from helpers.data import generate_normal_data from pybnesian import BandwidthSelector from scipy.stats import gaussian_kde -from data import generate_normal_data - SIZE = 500 df = generate_normal_data(SIZE, seed=0) df_float = df.astype("float32") diff --git a/tests/factors/continuous/LinearGaussianCPD_test.py b/tests/factors/continuous/LinearGaussianCPD_test.py index 65e8c1df..2a4ad1a3 100644 --- a/tests/factors/continuous/LinearGaussianCPD_test.py +++ b/tests/factors/continuous/LinearGaussianCPD_test.py @@ -2,10 +2,9 @@ import pandas as pd import pyarrow as pa import pybnesian as pbn +from helpers.data import generate_normal_data from scipy.stats import norm -from data import generate_normal_data - SIZE = 10000 df = generate_normal_data(SIZE) diff --git a/tests/factors/continuous/ProductKDE_test.py b/tests/factors/continuous/ProductKDE_test.py index 766317ff..505c0caa 100644 --- a/tests/factors/continuous/ProductKDE_test.py +++ b/tests/factors/continuous/ProductKDE_test.py @@ -2,11 +2,10 @@ import pyarrow as pa import pybnesian as pbn import pytest +from helpers.data import generate_normal_data from pybnesian import BandwidthSelector from scipy.stats import gaussian_kde -from data import generate_normal_data - SIZE = 500 df = generate_normal_data(SIZE, seed=0) df_float = df.astype("float32") diff --git a/tests/factors/discrete/DiscreteFactor_test.py b/tests/factors/discrete/DiscreteFactor_test.py index a1c6bac6..c2b32566 100644 --- a/tests/factors/discrete/DiscreteFactor_test.py +++ b/tests/factors/discrete/DiscreteFactor_test.py @@ -3,8 +3,7 @@ import pyarrow as pa import pybnesian as pbn import pytest - -from data import generate_discrete_data +from helpers.data import generate_discrete_data df = generate_discrete_data(10000) diff --git a/tests/learning/algorithms/hillclimbing_test.py b/tests/learning/algorithms/hillclimbing_test.py index d554e225..5965065f 100644 --- a/tests/learning/algorithms/hillclimbing_test.py +++ b/tests/learning/algorithms/hillclimbing_test.py @@ -1,9 +1,8 @@ import numpy as np import pybnesian as pbn +from helpers.data import generate_normal_data from pybnesian import BayesianNetwork, BayesianNetworkType -from data import generate_normal_data - df = generate_normal_data(1000) # TODO: Add tests for normal data with dependencies # dep_df = generate_normal_data_dep(1000) diff --git a/tests/learning/independence_tests/independence_test.py b/tests/learning/independence_tests/independence_test.py index 5b4cb97c..4486d247 100644 --- a/tests/learning/independence_tests/independence_test.py +++ b/tests/learning/independence_tests/independence_test.py @@ -2,7 +2,7 @@ import numpy as np import pandas as pd -from data import ( +from helpers.data import ( DATA_SIZE, N_NEIGHBORS, SEED, diff --git a/tests/learning/operators/operatorpool_test.py b/tests/learning/operators/operatorpool_test.py index c6febebd..49f365d6 100644 --- a/tests/learning/operators/operatorpool_test.py +++ b/tests/learning/operators/operatorpool_test.py @@ -1,7 +1,6 @@ import pybnesian as pbn import pytest - -from data import generate_normal_data +from helpers.data import generate_normal_data SIZE = 10000 df = generate_normal_data(SIZE) diff --git a/tests/learning/operators/operatorset_test.py b/tests/learning/operators/operatorset_test.py index 9d0d600f..b165fa1a 100644 --- 
a/tests/learning/operators/operatorset_test.py +++ b/tests/learning/operators/operatorset_test.py @@ -1,8 +1,7 @@ import numpy as np import pybnesian as pbn import pytest - -from data import generate_normal_data +from helpers.data import generate_normal_data SIZE = 10000 df = generate_normal_data(SIZE) diff --git a/tests/learning/parameters/mle_test.py b/tests/learning/parameters/mle_test.py index fb672f1d..fb33e182 100644 --- a/tests/learning/parameters/mle_test.py +++ b/tests/learning/parameters/mle_test.py @@ -1,8 +1,7 @@ import numpy as np import pybnesian as pbn import pytest - -from data import generate_normal_data +from helpers.data import generate_normal_data SIZE = 10000 df = generate_normal_data(SIZE) diff --git a/tests/learning/scores/bic_test.py b/tests/learning/scores/bic_test.py index 44196a20..8a415452 100644 --- a/tests/learning/scores/bic_test.py +++ b/tests/learning/scores/bic_test.py @@ -1,9 +1,8 @@ import numpy as np import pybnesian as pbn +from helpers.data import generate_normal_data from scipy.stats import norm -from data import generate_normal_data - SIZE = 10000 df = generate_normal_data(SIZE) diff --git a/tests/learning/scores/cvlikelihood_test.py b/tests/learning/scores/cvlikelihood_test.py index e3b25c28..4164b836 100644 --- a/tests/learning/scores/cvlikelihood_test.py +++ b/tests/learning/scores/cvlikelihood_test.py @@ -2,10 +2,9 @@ import pandas as pd import pybnesian as pbn import pytest +from helpers.data import generate_normal_data from scipy.stats import gaussian_kde, norm -from data import generate_normal_data - SIZE = 1000 df = generate_normal_data(SIZE) diff --git a/tests/learning/scores/holdoutlikelihood_test.py b/tests/learning/scores/holdoutlikelihood_test.py index 08d92319..b795898f 100644 --- a/tests/learning/scores/holdoutlikelihood_test.py +++ b/tests/learning/scores/holdoutlikelihood_test.py @@ -2,10 +2,9 @@ import pandas as pd import pybnesian as pbn import pytest +from helpers.data import generate_normal_data from scipy.stats import gaussian_kde, norm -from data import generate_normal_data - SIZE = 1000 df = generate_normal_data(SIZE) seed = 0 diff --git a/tests/models/BayesianNetwork_test.py b/tests/models/BayesianNetwork_test.py index 1420850c..383be717 100644 --- a/tests/models/BayesianNetwork_test.py +++ b/tests/models/BayesianNetwork_test.py @@ -1,10 +1,9 @@ import numpy as np import pybnesian as pbn import pytest +from helpers.data import generate_normal_data from pybnesian import BayesianNetwork, GaussianNetwork -from data import generate_normal_data - df = generate_normal_data(10000) diff --git a/tests/models/BayesianNetwork_type_test.py b/tests/models/BayesianNetwork_type_test.py index 5d8bc01a..32451f79 100644 --- a/tests/models/BayesianNetwork_type_test.py +++ b/tests/models/BayesianNetwork_type_test.py @@ -1,4 +1,5 @@ import pybnesian as pbn +from helpers.data import generate_normal_data_independent from pybnesian import ( BayesianNetwork, BayesianNetworkType, @@ -9,8 +10,6 @@ SemiparametricBN, ) -from data import generate_normal_data_independent - def test_bn_type(): g1 = GaussianNetwork(["A", "B", "C", "D"]) diff --git a/tests/models/DynamicBayesianNetwork_test.py b/tests/models/DynamicBayesianNetwork_test.py index 9c849b1b..c41b39b6 100644 --- a/tests/models/DynamicBayesianNetwork_test.py +++ b/tests/models/DynamicBayesianNetwork_test.py @@ -4,6 +4,7 @@ import pandas as pd import pybnesian as pbn import pytest +from helpers.data import generate_normal_data from pybnesian import ( ConditionalGaussianNetwork, 
DynamicGaussianNetwork, @@ -11,8 +12,6 @@ ) from scipy.stats import norm -from data import generate_normal_data - df = generate_normal_data(1000) diff --git a/tests/models/SemiparametricBN_test.py b/tests/models/SemiparametricBN_test.py index 0045d560..b33e69af 100644 --- a/tests/models/SemiparametricBN_test.py +++ b/tests/models/SemiparametricBN_test.py @@ -1,10 +1,9 @@ import numpy as np import pybnesian as pbn import pytest +from helpers.data import generate_normal_data from pybnesian import CKDE, LinearGaussianCPD, SemiparametricBN -from data import generate_normal_data - df = generate_normal_data(10000) diff --git a/tests/serialization/serialize_models_test.py b/tests/serialization/serialize_models_test.py index 2f39d9ce..f7c50949 100644 --- a/tests/serialization/serialize_models_test.py +++ b/tests/serialization/serialize_models_test.py @@ -3,6 +3,7 @@ import pyarrow as pa import pybnesian as pbn import pytest +from helpers.data import generate_discrete_data, generate_normal_data_independent from pybnesian import ( CKDE, BayesianNetwork, @@ -16,8 +17,6 @@ SemiparametricBN, ) -from data import generate_discrete_data, generate_normal_data_independent - @pytest.fixture def gaussian_bytes(): From 9ce0770ba03c768f2bc33ce556c4a94f804af2c8 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Wed, 19 Mar 2025 16:29:14 +0000 Subject: [PATCH 59/75] Update data generation functions: rename TRUE_LABEL and add generate_non_normal_data_classification --- tests/helpers/data.py | 73 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 2 deletions(-) diff --git a/tests/helpers/data.py b/tests/helpers/data.py index 7aa0063e..900a4f9b 100644 --- a/tests/helpers/data.py +++ b/tests/helpers/data.py @@ -1,10 +1,14 @@ import numpy as np import pandas as pd -TRUE_LABEL = "class_label" +# Constants +TRUE_LABEL = "attack_label" +SUPER_PARENT = "A" DATA_SIZE = 10000 -SEED = 0 +SAMPLE_SIZE = 100 + N_NEIGHBORS = 3 +SEED = 0 def generate_normal_data(size: int, seed: int = SEED) -> pd.DataFrame: @@ -433,3 +437,68 @@ def generate_normal_data_classification(size: int, seed: int = SEED) -> pd.DataF } ) return df + + +def generate_non_normal_data_classification( + size: int, seed: int = SEED +) -> pd.DataFrame: + """Generates a DataFrame of uniformly distributed data with non-linear relationships and a true label. + The relationships are as follows: + - TRUE_LABEL ~ Categorical(0.3, 0.4, 0.3) + - A ~ U(0, 10) + - B ~ U(5, 15) if class = class1, else U(10, 20) if class = class2, else U(15, 25) if class = class3 + - C ~ sin(A) + cos(B) + U(-1, 1) if class = class1, else exp(A / 10) + log(B + 1) + U(-0.5, 0.5) if class = class2, else A * B + U(-2, 2) if class = class3 + + Args: + size (int): The sample size. + seed (int, optional): The seed for random sampling. Defaults to 0. + + Returns: + pd.DataFrame: The DataFrame. 
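+
+    Example (illustrative sketch; column order follows the construction above):
+        >>> df = generate_non_normal_data_classification(100)
+        >>> sorted(df.columns)
+        ['A', 'B', 'C', 'attack_label']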
+ """ + np.random.seed(seed) + + class_dict = np.asarray(["class1", "class2", "class3"]) + class_values = class_dict[ + np.random.choice(class_dict.size, size, p=[0.3, 0.4, 0.3]) + ] + + a_values = np.random.uniform(0, 10, size) + + b_values = np.empty_like(a_values) + c_values = np.empty_like(a_values) + + # Indices + class1_indices = class_values == "class1" + class2_indices = class_values == "class2" + class3_indices = class_values == "class3" + + # Sampling + b_values[class1_indices] = np.random.uniform(5, 15, size=class1_indices.sum()) + b_values[class2_indices] = np.random.uniform(10, 20, size=class2_indices.sum()) + b_values[class3_indices] = np.random.uniform(15, 25, size=class3_indices.sum()) + + c_values[class1_indices] = ( + np.sin(a_values[class1_indices]) + + np.cos(b_values[class1_indices]) + + np.random.uniform(-1, 1, size=class1_indices.sum()) + ) + c_values[class2_indices] = ( + np.exp(a_values[class2_indices] / 10) + + np.log(b_values[class2_indices] + 1) + + np.random.uniform(-0.5, 0.5, size=class2_indices.sum()) + ) + c_values[class3_indices] = a_values[class3_indices] * b_values[ + class3_indices + ] + np.random.uniform(-2, 2, size=class3_indices.sum()) + + # DataFrame + df = pd.DataFrame( + { + TRUE_LABEL: pd.Series(class_values, dtype="category"), + "A": a_values, + "B": b_values, + "C": c_values, + } + ) + return df From da3018bd144650858f1d9edab94a0386e2991332 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Tue, 25 Mar 2025 11:57:59 +0000 Subject: [PATCH 60/75] Refactor tests with unified DATA_SIZE constant --- tests/dataset/crossvalidation_test.py | 22 ++++++++--------- tests/dataset/holdout_test.py | 24 +++++++++---------- tests/factors/continuous/CKDE_test.py | 13 +++++----- .../continuous/LinearGaussianCPD_test.py | 14 +++++------ tests/factors/discrete/DiscreteFactor_test.py | 4 ++-- tests/learning/operators/operatorpool_test.py | 5 ++-- tests/learning/operators/operatorset_test.py | 5 ++-- tests/learning/parameters/mle_test.py | 5 ++-- tests/learning/scores/bic_test.py | 14 +++++------ tests/models/BayesianNetwork_test.py | 4 ++-- tests/models/SemiparametricBN_test.py | 4 ++-- 11 files changed, 51 insertions(+), 63 deletions(-) diff --git a/tests/dataset/crossvalidation_test.py b/tests/dataset/crossvalidation_test.py index 7287bf73..7bae2954 100644 --- a/tests/dataset/crossvalidation_test.py +++ b/tests/dataset/crossvalidation_test.py @@ -1,10 +1,8 @@ import numpy as np import pybnesian as pbn -from helpers.data import generate_normal_data +from helpers.data import DATA_SIZE, generate_normal_data -SIZE = 10000 - -df = generate_normal_data(SIZE) +df = generate_normal_data(DATA_SIZE) def test_cv_disjoint_indices(): @@ -16,7 +14,7 @@ def test_cv_disjoint_indices(): combination = np.hstack((nptrain, nptest)) assert np.all( - np.sort(combination) == np.arange(SIZE) + np.sort(combination) == np.arange(DATA_SIZE) ), "Not all the examples are included in the cross validation." 
assert np.all( train_df.to_pandas().to_numpy() == df.iloc[train_indices, :].to_numpy() @@ -168,10 +166,10 @@ def test_cv_loc(): def test_cv_null(): np.random.seed(0) - a_null = np.random.randint(0, SIZE, size=100) - b_null = np.random.randint(0, SIZE, size=100) - c_null = np.random.randint(0, SIZE, size=100) - d_null = np.random.randint(0, SIZE, size=100) + a_null = np.random.randint(0, DATA_SIZE, size=100) + b_null = np.random.randint(0, DATA_SIZE, size=100) + c_null = np.random.randint(0, DATA_SIZE, size=100) + d_null = np.random.randint(0, DATA_SIZE, size=100) df_null = df df_null.loc[df_null.index[a_null], "A"] = np.nan @@ -193,7 +191,7 @@ def test_cv_null(): actual_combination = np.sort( np.setdiff1d( - np.arange(SIZE), + np.arange(DATA_SIZE), np.asarray( list(set(list(a_null) + list(b_null) + list(c_null) + list(d_null))) ), @@ -230,7 +228,7 @@ def test_cv_null(): ): assert ( train_df.num_rows + test_df.num_rows - ) == SIZE, "CV did not remove null instances correctly." + ) == DATA_SIZE, "CV did not remove null instances correctly." nptrain = np.asarray(train_indices) nptest = np.asarray(test_indices) @@ -242,7 +240,7 @@ def test_cv_null(): test_indices_mat = df.iloc[test_indices, :].to_numpy() assert np.all( - np.sort(combination) == np.arange(SIZE) + np.sort(combination) == np.arange(DATA_SIZE) ), "Not all the examples are included in the cross validation." assert np.all( np.isnan(train_df_mat) == np.isnan(train_indices_mat) diff --git a/tests/dataset/holdout_test.py b/tests/dataset/holdout_test.py index 6486af67..4cf9d373 100644 --- a/tests/dataset/holdout_test.py +++ b/tests/dataset/holdout_test.py @@ -1,11 +1,9 @@ import numpy as np import pandas as pd import pybnesian as pbn -from helpers.data import generate_normal_data +from helpers.data import DATA_SIZE, generate_normal_data -SIZE = 10000 - -df = generate_normal_data(SIZE) +df = generate_normal_data(DATA_SIZE) def test_holdout_disjoint(): @@ -15,7 +13,7 @@ def test_holdout_disjoint(): assert ( train_df.num_rows + test_df.num_rows - ) == SIZE, "HoldOut do not have the expected number of rows" + ) == DATA_SIZE, "HoldOut do not have the expected number of rows" assert train_df.num_rows == round( (1 - 0.2) * df.shape[0] @@ -37,7 +35,7 @@ def test_holdout_disjoint(): assert ( train_df.num_rows + test_df.num_rows - ) == SIZE, "HoldOut do not have the expected number of rows" + ) == DATA_SIZE, "HoldOut do not have the expected number of rows" assert train_df.num_rows == round( (1 - 0.3) * df.shape[0] @@ -82,10 +80,10 @@ def test_holdout_seed(): def test_holdout_null(): np.random.seed(0) - a_null = np.random.randint(0, SIZE, size=100) - b_null = np.random.randint(0, SIZE, size=100) - c_null = np.random.randint(0, SIZE, size=100) - d_null = np.random.randint(0, SIZE, size=100) + a_null = np.random.randint(0, DATA_SIZE, size=100) + b_null = np.random.randint(0, DATA_SIZE, size=100) + c_null = np.random.randint(0, DATA_SIZE, size=100) + d_null = np.random.randint(0, DATA_SIZE, size=100) df_null = df df_null.loc[df_null.index[a_null], "A"] = np.nan @@ -120,12 +118,12 @@ def test_holdout_null(): train_df, test_df = hold_null.training_data(), hold_null.test_data() assert ( train_df.num_rows + test_df.num_rows - ) == SIZE, "HoldOut do not have the expected number of rows" + ) == DATA_SIZE, "HoldOut do not have the expected number of rows" assert train_df.num_rows == round( - (1 - 0.2) * SIZE + (1 - 0.2) * DATA_SIZE ), "Train DataFrame do not have the expected number of instances" assert test_df.num_rows == round( - 0.2 * SIZE + 0.2 * 
DATA_SIZE ), "Test DataFrame do not have the expected number of instances" combination = pd.concat([train_df.to_pandas(), test_df.to_pandas()]) diff --git a/tests/factors/continuous/CKDE_test.py b/tests/factors/continuous/CKDE_test.py index 0e2b273e..afcc66ba 100644 --- a/tests/factors/continuous/CKDE_test.py +++ b/tests/factors/continuous/CKDE_test.py @@ -3,15 +3,14 @@ import pyarrow as pa import pybnesian as pbn import pytest -from helpers.data import generate_normal_data +from helpers.data import DATA_SIZE, generate_normal_data from scipy.stats import gaussian_kde from scipy.stats import multivariate_normal as mvn from scipy.stats import norm -SIZE = 10000 SMALL_SIZE = 10 TEST_SIZE = 50 -df = generate_normal_data(SIZE, seed=0) +df = generate_normal_data(DATA_SIZE, seed=0) df_small = generate_normal_data(SMALL_SIZE, seed=0) df_float = df.astype("float32") df_small_float = df_small.astype("float32") @@ -164,10 +163,10 @@ def _test_ckde_fit_null(variable, evidence, variables, _df, instances): assert cpd.num_instances() == scipy_kde.n np.random.seed(0) - a_null = np.random.randint(0, SIZE, size=100) - b_null = np.random.randint(0, SIZE, size=100) - c_null = np.random.randint(0, SIZE, size=100) - d_null = np.random.randint(0, SIZE, size=100) + a_null = np.random.randint(0, DATA_SIZE, size=100) + b_null = np.random.randint(0, DATA_SIZE, size=100) + c_null = np.random.randint(0, DATA_SIZE, size=100) + d_null = np.random.randint(0, DATA_SIZE, size=100) df_null = df.copy() df_null.loc[df_null.index[a_null], "A"] = np.nan diff --git a/tests/factors/continuous/LinearGaussianCPD_test.py b/tests/factors/continuous/LinearGaussianCPD_test.py index 2a4ad1a3..56b85a20 100644 --- a/tests/factors/continuous/LinearGaussianCPD_test.py +++ b/tests/factors/continuous/LinearGaussianCPD_test.py @@ -2,12 +2,10 @@ import pandas as pd import pyarrow as pa import pybnesian as pbn -from helpers.data import generate_normal_data +from helpers.data import DATA_SIZE, generate_normal_data from scipy.stats import norm -SIZE = 10000 - -df = generate_normal_data(SIZE) +df = generate_normal_data(DATA_SIZE) def test_lg_variable(): @@ -67,10 +65,10 @@ def test_lg_fit(): def test_lg_fit_null(): np.random.seed(0) - a_null = np.random.randint(0, SIZE, size=100) - b_null = np.random.randint(0, SIZE, size=100) - c_null = np.random.randint(0, SIZE, size=100) - d_null = np.random.randint(0, SIZE, size=100) + a_null = np.random.randint(0, DATA_SIZE, size=100) + b_null = np.random.randint(0, DATA_SIZE, size=100) + c_null = np.random.randint(0, DATA_SIZE, size=100) + d_null = np.random.randint(0, DATA_SIZE, size=100) df_null = df.copy() df_null.loc[df_null.index[a_null], "A"] = np.nan diff --git a/tests/factors/discrete/DiscreteFactor_test.py b/tests/factors/discrete/DiscreteFactor_test.py index c2b32566..73434ea9 100644 --- a/tests/factors/discrete/DiscreteFactor_test.py +++ b/tests/factors/discrete/DiscreteFactor_test.py @@ -3,9 +3,9 @@ import pyarrow as pa import pybnesian as pbn import pytest -from helpers.data import generate_discrete_data +from helpers.data import DATA_SIZE, generate_discrete_data -df = generate_discrete_data(10000) +df = generate_discrete_data(DATA_SIZE) def test_data_type(): diff --git a/tests/learning/operators/operatorpool_test.py b/tests/learning/operators/operatorpool_test.py index 49f365d6..c2184689 100644 --- a/tests/learning/operators/operatorpool_test.py +++ b/tests/learning/operators/operatorpool_test.py @@ -1,9 +1,8 @@ import pybnesian as pbn import pytest -from helpers.data import generate_normal_data 
+from helpers.data import DATA_SIZE, generate_normal_data -SIZE = 10000 -df = generate_normal_data(SIZE) +df = generate_normal_data(DATA_SIZE) def test_create(): diff --git a/tests/learning/operators/operatorset_test.py b/tests/learning/operators/operatorset_test.py index b165fa1a..5deb682a 100644 --- a/tests/learning/operators/operatorset_test.py +++ b/tests/learning/operators/operatorset_test.py @@ -1,10 +1,9 @@ import numpy as np import pybnesian as pbn import pytest -from helpers.data import generate_normal_data +from helpers.data import DATA_SIZE, generate_normal_data -SIZE = 10000 -df = generate_normal_data(SIZE) +df = generate_normal_data(DATA_SIZE) def test_create_change_node(): diff --git a/tests/learning/parameters/mle_test.py b/tests/learning/parameters/mle_test.py index fb33e182..b676e813 100644 --- a/tests/learning/parameters/mle_test.py +++ b/tests/learning/parameters/mle_test.py @@ -1,10 +1,9 @@ import numpy as np import pybnesian as pbn import pytest -from helpers.data import generate_normal_data +from helpers.data import DATA_SIZE, generate_normal_data -SIZE = 10000 -df = generate_normal_data(SIZE) +df = generate_normal_data(DATA_SIZE) def numpy_fit_mle_lg(data, variable, evidence): diff --git a/tests/learning/scores/bic_test.py b/tests/learning/scores/bic_test.py index 8a415452..49f9d689 100644 --- a/tests/learning/scores/bic_test.py +++ b/tests/learning/scores/bic_test.py @@ -1,11 +1,9 @@ import numpy as np import pybnesian as pbn -from helpers.data import generate_normal_data +from helpers.data import DATA_SIZE, generate_normal_data from scipy.stats import norm -SIZE = 10000 - -df = generate_normal_data(SIZE) +df = generate_normal_data(DATA_SIZE) def numpy_local_score(data, variable, evidence): @@ -69,10 +67,10 @@ def test_bic_local_score_null(): ) np.random.seed(0) - a_null = np.random.randint(0, SIZE, size=100) - b_null = np.random.randint(0, SIZE, size=100) - c_null = np.random.randint(0, SIZE, size=100) - d_null = np.random.randint(0, SIZE, size=100) + a_null = np.random.randint(0, DATA_SIZE, size=100) + b_null = np.random.randint(0, DATA_SIZE, size=100) + c_null = np.random.randint(0, DATA_SIZE, size=100) + d_null = np.random.randint(0, DATA_SIZE, size=100) df_null = df.copy() df_null.loc[df_null.index[a_null], "A"] = np.nan diff --git a/tests/models/BayesianNetwork_test.py b/tests/models/BayesianNetwork_test.py index 383be717..14ce8d04 100644 --- a/tests/models/BayesianNetwork_test.py +++ b/tests/models/BayesianNetwork_test.py @@ -1,10 +1,10 @@ import numpy as np import pybnesian as pbn import pytest -from helpers.data import generate_normal_data +from helpers.data import DATA_SIZE, generate_normal_data from pybnesian import BayesianNetwork, GaussianNetwork -df = generate_normal_data(10000) +df = generate_normal_data(DATA_SIZE) def test_create_bn(): diff --git a/tests/models/SemiparametricBN_test.py b/tests/models/SemiparametricBN_test.py index b33e69af..7b9cc53b 100644 --- a/tests/models/SemiparametricBN_test.py +++ b/tests/models/SemiparametricBN_test.py @@ -1,10 +1,10 @@ import numpy as np import pybnesian as pbn import pytest -from helpers.data import generate_normal_data +from helpers.data import DATA_SIZE, generate_normal_data from pybnesian import CKDE, LinearGaussianCPD, SemiparametricBN -df = generate_normal_data(10000) +df = generate_normal_data(DATA_SIZE) def test_create_spbn(): From 07524fe2b61d78c5c8d11907b17d1cb49c126129 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Wed, 26 Mar 2025 14:04:56 +0000 Subject: [PATCH 61/75] Refactor imports to use 
'pbn' namespace for consistency across tests --- tests/factors/continuous/KDE_test.py | 5 +- tests/factors/continuous/ProductKDE_test.py | 5 +- tests/factors/factor_type_test.py | 27 ++-- tests/learning/algorithms/constraint_test.py | 28 ++-- .../learning/algorithms/hillclimbing_test.py | 109 ++++++++-------- .../independence_tests/independence_test.py | 32 ++--- tests/models/BayesianNetwork_test.py | 93 ++++++++++---- tests/models/BayesianNetwork_type_test.py | 61 ++++----- tests/models/DynamicBayesianNetwork_test.py | 35 +++-- tests/models/SemiparametricBN_test.py | 53 ++++---- tests/serialization/serialize_factor_test.py | 31 ++--- .../serialize_factor_type_test.py | 12 +- tests/serialization/serialize_models_test.py | 120 +++++++++--------- 13 files changed, 311 insertions(+), 300 deletions(-) diff --git a/tests/factors/continuous/KDE_test.py b/tests/factors/continuous/KDE_test.py index e2e89de4..dec89299 100644 --- a/tests/factors/continuous/KDE_test.py +++ b/tests/factors/continuous/KDE_test.py @@ -3,7 +3,6 @@ import pybnesian as pbn import pytest from helpers.data import generate_normal_data -from pybnesian import BandwidthSelector from scipy.stats import gaussian_kde SIZE = 500 @@ -81,9 +80,9 @@ def test_kde_bandwidth(): assert cpd.bandwidth == np.asarray([[1]]), "Could not change bandwidth." -class UnitaryBandwidth(BandwidthSelector): +class UnitaryBandwidth(pbn.BandwidthSelector): def __init__(self): - BandwidthSelector.__init__(self) + pbn.BandwidthSelector.__init__(self) def bandwidth(self, df, variables): return np.eye(len(variables)) diff --git a/tests/factors/continuous/ProductKDE_test.py b/tests/factors/continuous/ProductKDE_test.py index 505c0caa..f8cc6fee 100644 --- a/tests/factors/continuous/ProductKDE_test.py +++ b/tests/factors/continuous/ProductKDE_test.py @@ -3,7 +3,6 @@ import pybnesian as pbn import pytest from helpers.data import generate_normal_data -from pybnesian import BandwidthSelector from scipy.stats import gaussian_kde SIZE = 500 @@ -104,9 +103,9 @@ def test_productkde_bandwidth(): assert cpd.bandwidth == np.asarray([1]), "Could not change bandwidth." 
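# A custom selector only needs to subclass pbn.BandwidthSelector and override the
# relevant bandwidth hook; the UnitaryBandwidth class below does this for ProductKDE
# via diag_bandwidth. Called directly it is plain Python (a sketch, using the
# df_float frame defined above):
#
#     selector = UnitaryBandwidth()
#     selector.diag_bandwidth(df_float, ["A"])  # -> array([1.]), a unit bandwidth
#
# so every variable receives a fixed unit smoothing parameter, independent of the data.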
-class UnitaryBandwidth(BandwidthSelector):
+class UnitaryBandwidth(pbn.BandwidthSelector):
     def __init__(self):
-        BandwidthSelector.__init__(self)
+        pbn.BandwidthSelector.__init__(self)

     def diag_bandwidth(self, df, variables):
         return np.ones((len(variables),))
diff --git a/tests/factors/factor_type_test.py b/tests/factors/factor_type_test.py
index 3a5905d4..54ca22ec 100644
--- a/tests/factors/factor_type_test.py
+++ b/tests/factors/factor_type_test.py
@@ -1,6 +1,5 @@
 import pybnesian as pbn
 import pytest
-from pybnesian import Factor, FactorType


 def test_factor_type():
@@ -37,9 +36,9 @@ def test_factor_type():


 def test_new_factor_type():
-    class A(FactorType):
+    class A(pbn.FactorType):
         def __init__(self):
-            FactorType.__init__(self)
+            pbn.FactorType.__init__(self)

     a1 = A()
     a2 = A()
@@ -49,9 +48,9 @@ def __init__(self):
     assert a1 == a3
     assert a2 == a3

-    class B(FactorType):
+    class B(pbn.FactorType):
         def __init__(self):
-            FactorType.__init__(self)
+            pbn.FactorType.__init__(self)

     b1 = B()
     b2 = B()
@@ -65,16 +64,16 @@ def __init__(self):


 def test_factor_defined_factor_type():
-    class F_type(FactorType):
+    class F_type(pbn.FactorType):
         def __init__(self):
-            FactorType.__init__(self)
+            pbn.FactorType.__init__(self)

         def __str__(self):
             return "FType"

-    class F(Factor):
+    class F(pbn.Factor):
         def __init__(self, variable, evidence):
-            Factor.__init__(self, variable, evidence)
+            pbn.Factor.__init__(self, variable, evidence)

         def type(self):
             return F_type()
@@ -92,13 +91,13 @@ def type(self):
     dummy_network = pbn.GaussianNetwork(["A", "B", "C", "D"])
     with pytest.raises(RuntimeError) as ex:
         f1.type().new_factor(dummy_network, "D", ["A", "B", "C"])
     assert 'Tried to call pure virtual function "FactorType::new_factor"' in str(
         ex.value
     )

-    class G_type(FactorType):
+    class G_type(pbn.FactorType):
         def __init__(self):
-            FactorType.__init__(self)
+            pbn.FactorType.__init__(self)

         def new_factor(self, model, variable, evidence):
             return G(variable, evidence)
@@ -106,9 +105,9 @@ def new_factor(self, model, variable, evidence):
         def __str__(self):
             return "GType"

-    class G(Factor):
+    class G(pbn.Factor):
         def __init__(self, variable, evidence):
-            Factor.__init__(self, variable, evidence)
+            pbn.Factor.__init__(self, variable, evidence)

         def type(self):
             return G_type()
diff --git a/tests/learning/algorithms/constraint_test.py b/tests/learning/algorithms/constraint_test.py
index 34398c5a..56edf443 100644
--- a/tests/learning/algorithms/constraint_test.py
+++ b/tests/learning/algorithms/constraint_test.py
@@ -1,46 +1,46 @@
-from pybnesian import MeekRules, PartiallyDirectedGraph
+import pybnesian as pbn


 def test_meek_rule1():
     # From Koller Chapter 3.4, Figure 3.12, pag 89.
-    gr1 = PartiallyDirectedGraph(["X", "Y", "Z"], [("X", "Y")], [("Y", "Z")])
+    gr1 = pbn.PartiallyDirectedGraph(["X", "Y", "Z"], [("X", "Y")], [("Y", "Z")])

-    assert MeekRules.rule1(gr1)
+    assert pbn.MeekRules.rule1(gr1)

     assert gr1.num_edges() == 0
     assert set(gr1.arcs()) == set([("X", "Y"), ("Y", "Z")])

-    assert not MeekRules.rule1(gr1)
+    assert not pbn.MeekRules.rule1(gr1)


 def test_meek_rule2():
     # From Koller Chapter 3.4, Figure 3.12, pag 89.
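    # Rule 2: with the directed path X -> Y -> Z in place, the undirected edge
    # X - Z can only be oriented as X -> Z, because Z -> X would close the
    # directed cycle X -> Y -> Z -> X.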
- gr2 = PartiallyDirectedGraph( + gr2 = pbn.PartiallyDirectedGraph( ["X", "Y", "Z"], [("X", "Y"), ("Y", "Z")], [("X", "Z")] ) - assert MeekRules.rule2(gr2) + assert pbn.MeekRules.rule2(gr2) assert gr2.num_edges() == 0 assert set(gr2.arcs()) == set([("X", "Y"), ("Y", "Z"), ("X", "Z")]) - assert not MeekRules.rule2(gr2) + assert not pbn.MeekRules.rule2(gr2) def test_meek_rule3(): # From Koller Chapter 3.4, Figure 3.12, pag 89. - gr3 = PartiallyDirectedGraph( + gr3 = pbn.PartiallyDirectedGraph( ["X", "Y1", "Y2", "Z"], [("Y1", "Z"), ("Y2", "Z")], [("X", "Y1"), ("X", "Y2"), ("X", "Z")], ) - assert MeekRules.rule3(gr3) + assert pbn.MeekRules.rule3(gr3) assert set(gr3.edges()) == set([("X", "Y1"), ("X", "Y2")]) assert set(gr3.arcs()) == set([("X", "Z"), ("Y1", "Z"), ("Y2", "Z")]) - assert not MeekRules.rule3(gr3) + assert not pbn.MeekRules.rule3(gr3) def test_meek_sequential(): # From Koller Chapter 3.4, Figure 3.13, pag 90. - koller = PartiallyDirectedGraph( + koller = pbn.PartiallyDirectedGraph( ["A", "B", "C", "D", "E", "F", "G"], [("B", "E"), ("C", "E")], [("A", "B"), ("B", "D"), ("C", "F"), ("E", "F"), ("F", "G")], @@ -48,9 +48,9 @@ def test_meek_sequential(): changed = True while changed: changed = False - changed = changed or MeekRules.rule1(koller) - changed = changed or MeekRules.rule2(koller) - changed = changed or MeekRules.rule3(koller) + changed = changed or pbn.MeekRules.rule1(koller) + changed = changed or pbn.MeekRules.rule2(koller) + changed = changed or pbn.MeekRules.rule3(koller) assert set(koller.edges()) == set([("A", "B"), ("B", "D")]) assert set(koller.arcs()) == set( diff --git a/tests/learning/algorithms/hillclimbing_test.py b/tests/learning/algorithms/hillclimbing_test.py index 5965065f..902adfad 100644 --- a/tests/learning/algorithms/hillclimbing_test.py +++ b/tests/learning/algorithms/hillclimbing_test.py @@ -1,13 +1,54 @@ import numpy as np import pybnesian as pbn from helpers.data import generate_normal_data -from pybnesian import BayesianNetwork, BayesianNetworkType df = generate_normal_data(1000) + + # TODO: Add tests for normal data with dependencies # dep_df = generate_normal_data_dep(1000) +class MyRestrictedGaussianNetworkType(pbn.BayesianNetworkType): + def __init__(self): + pbn.BayesianNetworkType.__init__(self) + + def is_homogeneous(self): + return True + + def default_node_type(self): + return pbn.LinearGaussianCPDType() + + def can_have_arc(self, model, source, target): + return "A" in source + + def new_bn(self, nodes): + return NewBN(nodes) + + def __str__(self): + return "MyRestrictedGaussianNetworkType" + + +class NewBN(pbn.BayesianNetwork): + def __init__(self, variables, arcs=None): + if arcs is None: + pbn.BayesianNetwork.__init__( + self, MyRestrictedGaussianNetworkType(), variables + ) + else: + pbn.BayesianNetwork.__init__( + self, MyRestrictedGaussianNetworkType(), variables, arcs + ) + + self.extra_data = "extra" + + def __getstate_extra__(self): + return self.extra_data + + def __setstate_extra__(self, extra): + self.extra_data = extra + + def test_hc_estimate(): bic = pbn.BIC(df) column_names = list(df.columns.values) @@ -209,6 +250,19 @@ def test_hc_shortcut_function(): assert type(model) == NewBN +def test_newbn_estimate_validation(): + start = NewBN(["A", "B", "C", "D"]) + hc = pbn.GreedyHillClimbing() + arc = pbn.ArcOperatorSet() + bic = pbn.BIC(df) + + estimated = hc.estimate(arc, bic, start) + + assert type(start) == type(estimated) + assert estimated.extra_data == "extra" + + +# TODO: Test for when one variable has 0 variance in 
k-fold cross-validation for CKDEType # # NOTE: Deprecated test for PyBNesian with full covariance matrices # def test_hc_arc_singular_covariance(): # """Function to test if with the GBN, KDE and SPBN, the HC algorithm raises an exception when the covariance matrix is singular. Then we check if the learnt model is valid.""" @@ -254,56 +308,3 @@ def test_hc_shortcut_function(): # assert np.count_nonzero(np.isnan(spbn.logl(dep_df))) == 0 # for c in column_names: # print(f"{spbn.cpd(c)}") - - -# TODO: Test for when one variable has 0 variance in k-fold cross-validation for CKDEType - - -class MyRestrictedGaussianNetworkType(BayesianNetworkType): - def __init__(self): - BayesianNetworkType.__init__(self) - - def is_homogeneous(self): - return True - - def default_node_type(self): - return pbn.LinearGaussianCPDType() - - def can_have_arc(self, model, source, target): - return "A" in source - - def new_bn(self, nodes): - return NewBN(nodes) - - def __str__(self): - return "MyRestrictedGaussianNetworkType" - - -class NewBN(BayesianNetwork): - def __init__(self, variables, arcs=None): - if arcs is None: - BayesianNetwork.__init__(self, MyRestrictedGaussianNetworkType(), variables) - else: - BayesianNetwork.__init__( - self, MyRestrictedGaussianNetworkType(), variables, arcs - ) - - self.extra_data = "extra" - - def __getstate_extra__(self): - return self.extra_data - - def __setstate_extra__(self, extra): - self.extra_data = extra - - -def test_newbn_estimate_validation(): - start = NewBN(["A", "B", "C", "D"]) - hc = pbn.GreedyHillClimbing() - arc = pbn.ArcOperatorSet() - bic = pbn.BIC(df) - - estimated = hc.estimate(arc, bic, start) - - assert type(start) == type(estimated) - assert estimated.extra_data == "extra" diff --git a/tests/learning/independence_tests/independence_test.py b/tests/learning/independence_tests/independence_test.py index 4486d247..185973bf 100644 --- a/tests/learning/independence_tests/independence_test.py +++ b/tests/learning/independence_tests/independence_test.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd +import pybnesian as pbn from helpers.data import ( DATA_SIZE, N_NEIGHBORS, @@ -11,13 +12,6 @@ generate_normal_data, generate_normal_data_independent, ) -from pybnesian import ( - ChiSquare, - KMutualInformation, - LinearCorrelation, - MutualInformation, - RCoT, -) from scipy.stats import pearsonr # from sklearn.feature_selection import mutual_info_regression @@ -43,8 +37,8 @@ def test_chi_square(): """Test the chi-square independence test with discrete data""" - chi_square = ChiSquare(discrete_data) - independent_chi_square = ChiSquare(independent_discrete_data) + chi_square = pbn.ChiSquare(discrete_data) + independent_chi_square = pbn.ChiSquare(independent_discrete_data) p_value = chi_square.pvalue("A", "B") independent_p_value = independent_chi_square.pvalue("A", "B") @@ -61,8 +55,8 @@ def test_linear_correlation(): independent_df = independent_data[["A", "B"]] # Pybnesian Linear correlation - linear_correlation = LinearCorrelation(df) - independent_linear_correlation = LinearCorrelation(independent_df) + linear_correlation = pbn.LinearCorrelation(df) + independent_linear_correlation = pbn.LinearCorrelation(independent_df) pvalue = linear_correlation.pvalue("A", "B") independent_pvalue = independent_linear_correlation.pvalue("A", "B") @@ -99,8 +93,8 @@ def test_linear_correlation(): def test_mutual_info(): """Test the mutual information independence test with normal data""" - mutual_info = MutualInformation(data) - independent_mutual_info = 
MutualInformation(independent_data)
+    mutual_info = pbn.MutualInformation(data)
+    independent_mutual_info = pbn.MutualInformation(independent_data)

     # Check whether the mutual information is higher when the variables are dependent
     mutual_info_value = mutual_info.mi("A", "B")
@@ -116,8 +110,8 @@ def test_k_mutual_info():
     """Test the k-nearest neighbors mutual information independence test with normal data"""
-    k_mutual_info = KMutualInformation(data, k=N_NEIGHBORS)
-    independent_k_mutual_info = KMutualInformation(independent_data, k=N_NEIGHBORS)
+    k_mutual_info = pbn.KMutualInformation(data, k=N_NEIGHBORS)
+    independent_k_mutual_info = pbn.KMutualInformation(independent_data, k=N_NEIGHBORS)

     # Check whether the mutual information is higher when the variables are dependent
     k_mutual_info_value = k_mutual_info.mi("A", "B")
@@ -148,9 +142,11 @@ def test_k_mutual_info():


 def test_rcot():
-    """Test the Randomized Conditional Correlation Test (RCoT) independence test with normal data"""
-    rcot = RCoT(data, random_fourier_xy=5, random_fourier_z=100)
-    independent_rcot = RCoT(independent_data, random_fourier_xy=5, random_fourier_z=100)
+    """Test the Randomized Conditional Correlation Test (RCoT) independence test with normal data"""
+    rcot = pbn.RCoT(data, random_fourier_xy=5, random_fourier_z=100)
+    independent_rcot = pbn.RCoT(
+        independent_data, random_fourier_xy=5, random_fourier_z=100
+    )

     p_value = rcot.pvalue("A", "B")
     independent_p_value = independent_rcot.pvalue("A", "B")
diff --git a/tests/models/BayesianNetwork_test.py b/tests/models/BayesianNetwork_test.py
index 14ce8d04..fec37dcf 100644
--- a/tests/models/BayesianNetwork_test.py
+++ b/tests/models/BayesianNetwork_test.py
@@ -2,64 +2,105 @@
 import pybnesian as pbn
 import pytest
 from helpers.data import DATA_SIZE, generate_normal_data
-from pybnesian import BayesianNetwork, GaussianNetwork

 df = generate_normal_data(DATA_SIZE)


-def test_create_bn():
-    gbn = GaussianNetwork(["A", "B", "C", "D"])
+def test_create_gaussian_bn():
+    gbn = pbn.GaussianNetwork(["A", "B", "C", "D"])

     assert gbn.num_nodes() == 4
     assert gbn.num_arcs() == 0
     assert gbn.nodes() == ["A", "B", "C", "D"]

-    gbn = GaussianNetwork(["A", "B", "C", "D"], [("A", "C")])
+    gbn = pbn.GaussianNetwork(["A", "B", "C", "D"], [("A", "C")])
     assert gbn.num_nodes() == 4
     assert gbn.num_arcs() == 1
     assert gbn.nodes() == ["A", "B", "C", "D"]

-    gbn = GaussianNetwork([("A", "C"), ("B", "D"), ("C", "D")])
+    gbn = pbn.GaussianNetwork([("A", "C"), ("B", "D"), ("C", "D")])
     assert gbn.num_nodes() == 4
     assert gbn.num_arcs() == 3
     assert gbn.nodes() == ["A", "C", "B", "D"]

     with pytest.raises(TypeError) as ex:
-        gbn = GaussianNetwork(["A", "B", "C"], [("A", "C", "B")])
+        gbn = pbn.GaussianNetwork(["A", "B", "C"], [("A", "C", "B")])
     assert "incompatible constructor arguments" in str(ex.value)

     with pytest.raises(IndexError) as ex:
-        gbn = GaussianNetwork(["A", "B", "C"], [("A", "D")])
+        gbn = pbn.GaussianNetwork(["A", "B", "C"], [("A", "D")])
     assert "not present in the graph" in str(ex.value)

     with pytest.raises(ValueError) as ex:
-        gbn = GaussianNetwork([("A", "B"), ("B", "C"), ("C", "A")])
+        gbn = pbn.GaussianNetwork([("A", "B"), ("B", "C"), ("C", "A")])
     assert "must be a DAG" in str(ex.value)

     with pytest.raises(ValueError) as ex:
-        gbn = GaussianNetwork(
+        gbn = pbn.GaussianNetwork(
             ["A", "B", "C", "D"], [("A", "B"), ("B", "C"), ("C", "A")]
         )
     assert "must be a DAG" in str(ex.value)

     with pytest.raises(ValueError) as ex:
-        gbn = BayesianNetwork(
+        gbn = pbn.BayesianNetwork(
pbn.GaussianNetworkType(), ["A", "B", "C", "D"], [], [("A", pbn.CKDEType())] ) assert "Wrong factor type" in str(ex.value) -def gbn_generator(): - # Test different Networks created with different constructors. - gbn = GaussianNetwork(["A", "B", "C", "D"]) - yield gbn - gbn = GaussianNetwork([("A", "C"), ("B", "D"), ("C", "D")]) - yield gbn - gbn = GaussianNetwork(["A", "B", "C", "D"], [("A", "B"), ("B", "C")]) - yield gbn +def test_create_discrete_bn(): + dbn = pbn.DiscreteBN(["A", "B", "C", "D"]) + + assert dbn.num_nodes() == 4 + assert dbn.num_arcs() == 0 + assert dbn.nodes() == ["A", "B", "C", "D"] + + dbn = pbn.DiscreteBN(["A", "B", "C", "D"], [("A", "C")]) + assert dbn.num_nodes() == 4 + assert dbn.num_arcs() == 1 + assert dbn.nodes() == ["A", "B", "C", "D"] + + dbn = pbn.DiscreteBN([("A", "C"), ("B", "D"), ("C", "D")]) + assert dbn.num_nodes() == 4 + assert dbn.num_arcs() == 3 + assert dbn.nodes() == ["A", "C", "B", "D"] + + with pytest.raises(TypeError) as ex: + dbn = pbn.DiscreteBN(["A", "B", "C"], [("A", "C", "B")]) + assert "incompatible constructor arguments" in str(ex.value) + + with pytest.raises(IndexError) as ex: + dbn = pbn.DiscreteBN(["A", "B", "C"], [("A", "D")]) + assert "not present in the graph" in str(ex.value) + + with pytest.raises(ValueError) as ex: + dbn = pbn.DiscreteBN([("A", "B"), ("B", "C"), ("C", "A")]) + assert "must be a DAG" in str(ex.value) + + with pytest.raises(ValueError) as ex: + dbn = pbn.DiscreteBN(["A", "B", "C", "D"], [("A", "B"), ("B", "C"), ("C", "A")]) + assert "must be a DAG" in str(ex.value) + + with pytest.raises(ValueError) as ex: + dbn = pbn.BayesianNetwork( + pbn.DiscreteBNType(), + ["A", "B", "C", "D"], + [], + [("A", pbn.CKDEType())], + ) + assert "Wrong factor type" in str(ex.value) def test_nodes_util(): + def gbn_generator(): + # Test different Networks created with different constructors. 
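+        # Three equivalent constructor forms follow: a node list, an arc list
+        # (nodes are inferred from the arc endpoints), and a node list plus an
+        # arc list.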
+ gbn = pbn.GaussianNetwork(["A", "B", "C", "D"]) + yield gbn + gbn = pbn.GaussianNetwork([("A", "C"), ("B", "D"), ("C", "D")]) + yield gbn + gbn = pbn.GaussianNetwork(["A", "B", "C", "D"], [("A", "B"), ("B", "C")]) + yield gbn + for gbn in gbn_generator(): assert gbn.num_nodes() == 4 @@ -84,7 +125,7 @@ def test_nodes_util(): def test_parent_children(): - gbn = GaussianNetwork(["A", "B", "C", "D"]) + gbn = pbn.GaussianNetwork(["A", "B", "C", "D"]) assert gbn.num_parents("A") == 0 assert gbn.num_parents("B") == 0 @@ -101,7 +142,7 @@ def test_parent_children(): assert gbn.num_children("C") == 0 assert gbn.num_children("D") == 0 - gbn = GaussianNetwork([("A", "C"), ("B", "D"), ("C", "D")]) + gbn = pbn.GaussianNetwork([("A", "C"), ("B", "D"), ("C", "D")]) assert gbn.num_parents("A") == 0 assert gbn.num_parents("B") == 0 @@ -118,7 +159,7 @@ def test_parent_children(): assert gbn.num_children("C") == 1 assert gbn.num_children("D") == 0 - gbn = GaussianNetwork(["A", "B", "C", "D"], [("A", "B"), ("B", "C")]) + gbn = pbn.GaussianNetwork(["A", "B", "C", "D"], [("A", "B"), ("B", "C")]) assert gbn.num_parents("A") == 0 assert gbn.num_parents("B") == 1 @@ -137,7 +178,7 @@ def test_parent_children(): def test_arcs(): - gbn = GaussianNetwork(["A", "B", "C", "D"]) + gbn = pbn.GaussianNetwork(["A", "B", "C", "D"]) assert gbn.num_arcs() == 0 assert gbn.arcs() == [] @@ -229,7 +270,7 @@ def test_arcs(): def test_bn_fit(): - gbn = GaussianNetwork( + gbn = pbn.GaussianNetwork( [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")] ) @@ -259,7 +300,7 @@ def test_bn_fit(): def test_add_cpds(): - gbn = GaussianNetwork( + gbn = pbn.GaussianNetwork( [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")] ) @@ -322,7 +363,7 @@ def test_add_cpds(): def test_bn_logl(): - gbn = GaussianNetwork( + gbn = pbn.GaussianNetwork( [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")] ) @@ -349,7 +390,7 @@ def test_bn_logl(): def test_bn_sample(): - gbn = GaussianNetwork( + gbn = pbn.GaussianNetwork( ["A", "C", "B", "D"], [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")], ) diff --git a/tests/models/BayesianNetwork_type_test.py b/tests/models/BayesianNetwork_type_test.py index 32451f79..4f454628 100644 --- a/tests/models/BayesianNetwork_type_test.py +++ b/tests/models/BayesianNetwork_type_test.py @@ -1,47 +1,38 @@ import pybnesian as pbn from helpers.data import generate_normal_data_independent -from pybnesian import ( - BayesianNetwork, - BayesianNetworkType, - ConditionalBayesianNetwork, - DiscreteBN, - GaussianNetwork, - KDENetwork, - SemiparametricBN, -) def test_bn_type(): - g1 = GaussianNetwork(["A", "B", "C", "D"]) - g2 = GaussianNetwork(["A", "B", "C", "D"]) - g3 = GaussianNetwork(["A", "B", "C", "D"]) + g1 = pbn.GaussianNetwork(["A", "B", "C", "D"]) + g2 = pbn.GaussianNetwork(["A", "B", "C", "D"]) + g3 = pbn.GaussianNetwork(["A", "B", "C", "D"]) assert g1.type() == pbn.GaussianNetworkType() assert g1.type() == g2.type() assert g1.type() == g3.type() assert g2.type() == g3.type() - s1 = SemiparametricBN(["A", "B", "C", "D"]) - s2 = SemiparametricBN(["A", "B", "C", "D"]) - s3 = SemiparametricBN(["A", "B", "C", "D"]) + s1 = pbn.SemiparametricBN(["A", "B", "C", "D"]) + s2 = pbn.SemiparametricBN(["A", "B", "C", "D"]) + s3 = pbn.SemiparametricBN(["A", "B", "C", "D"]) assert s1.type() == pbn.SemiparametricBNType() assert s1.type() == s2.type() assert s1.type() == s3.type() assert s2.type() == s3.type() - k1 = KDENetwork(["A", "B", "C", "D"]) - k2 = 
KDENetwork(["A", "B", "C", "D"]) - k3 = KDENetwork(["A", "B", "C", "D"]) + k1 = pbn.KDENetwork(["A", "B", "C", "D"]) + k2 = pbn.KDENetwork(["A", "B", "C", "D"]) + k3 = pbn.KDENetwork(["A", "B", "C", "D"]) assert k1.type() == pbn.KDENetworkType() assert k1.type() == k2.type() assert k1.type() == k3.type() assert k2.type() == k3.type() - d1 = DiscreteBN(["A", "B", "C", "D"]) - d2 = DiscreteBN(["A", "B", "C", "D"]) - d3 = DiscreteBN(["A", "B", "C", "D"]) + d1 = pbn.DiscreteBN(["A", "B", "C", "D"]) + d2 = pbn.DiscreteBN(["A", "B", "C", "D"]) + d3 = pbn.DiscreteBN(["A", "B", "C", "D"]) assert d1.type() == pbn.DiscreteBNType() assert d1.type() == d2.type() @@ -57,9 +48,9 @@ def test_bn_type(): def test_new_bn_type(): - class MyGaussianNetworkType(BayesianNetworkType): + class MyGaussianNetworkType(pbn.BayesianNetworkType): def __init__(self): - BayesianNetworkType.__init__(self) + pbn.BayesianNetworkType.__init__(self) def is_homogeneous(self): return True @@ -75,9 +66,9 @@ def can_have_arc(self, model, source, target): assert a1 == a3 assert a2 == a3 - class MySemiparametricBNType(BayesianNetworkType): + class MySemiparametricBNType(pbn.BayesianNetworkType): def __init__(self): - BayesianNetworkType.__init__(self) + pbn.BayesianNetworkType.__init__(self) b1 = MySemiparametricBNType() b2 = MySemiparametricBNType() @@ -89,7 +80,7 @@ def __init__(self): assert a1 != b1 - mybn = BayesianNetwork(a1, ["A", "B", "C", "D"]) + mybn = pbn.BayesianNetwork(a1, ["A", "B", "C", "D"]) # This type omits the arcs that do not have "A" as source. assert mybn.can_add_arc("A", "B") @@ -97,9 +88,9 @@ def __init__(self): assert not mybn.can_add_arc("C", "D") -class MyRestrictedGaussianNetworkType(BayesianNetworkType): +class MyRestrictedGaussianNetworkType(pbn.BayesianNetworkType): def __init__(self): - BayesianNetworkType.__init__(self) + pbn.BayesianNetworkType.__init__(self) def is_homogeneous(self): return True @@ -114,24 +105,26 @@ def __str__(self): return "MyRestrictedGaussianNetworkType" -class SpecificNetwork(BayesianNetwork): +class SpecificNetwork(pbn.BayesianNetwork): def __init__(self, variables, arcs=None): if arcs is None: - BayesianNetwork.__init__(self, MyRestrictedGaussianNetworkType(), variables) + pbn.BayesianNetwork.__init__( + self, MyRestrictedGaussianNetworkType(), variables + ) else: - BayesianNetwork.__init__( + pbn.BayesianNetwork.__init__( self, MyRestrictedGaussianNetworkType(), variables, arcs ) -class ConditionalSpecificNetwork(ConditionalBayesianNetwork): +class ConditionalSpecificNetwork(pbn.ConditionalBayesianNetwork): def __init__(self, variables, interface, arcs=None): if arcs is None: - ConditionalBayesianNetwork.__init__( + pbn.ConditionalBayesianNetwork.__init__( self, MyRestrictedGaussianNetworkType(), variables, interface ) else: - ConditionalBayesianNetwork.__init__( + pbn.ConditionalBayesianNetwork.__init__( self, MyRestrictedGaussianNetworkType(), variables, interface, arcs ) diff --git a/tests/models/DynamicBayesianNetwork_test.py b/tests/models/DynamicBayesianNetwork_test.py index c41b39b6..bba9e5bf 100644 --- a/tests/models/DynamicBayesianNetwork_test.py +++ b/tests/models/DynamicBayesianNetwork_test.py @@ -5,11 +5,6 @@ import pybnesian as pbn import pytest from helpers.data import generate_normal_data -from pybnesian import ( - ConditionalGaussianNetwork, - DynamicGaussianNetwork, - GaussianNetwork, -) from scipy.stats import norm df = generate_normal_data(1000) @@ -17,7 +12,7 @@ def test_create_dbn(): variables = ["A", "B", "C", "D"] - gbn = 
DynamicGaussianNetwork(variables, 2) + gbn = pbn.DynamicGaussianNetwork(variables, 2) assert gbn.markovian_order() == 2 assert gbn.variables() == ["A", "B", "C", "D"] @@ -31,10 +26,10 @@ def test_create_dbn(): assert set(gbn.transition_bn().interface_nodes()) == set(static_nodes) assert set(gbn.transition_bn().nodes()) == set(transition_nodes) - static_bn = GaussianNetwork(static_nodes) - transition_bn = ConditionalGaussianNetwork(transition_nodes, static_nodes) + static_bn = pbn.GaussianNetwork(static_nodes) + transition_bn = pbn.ConditionalGaussianNetwork(transition_nodes, static_nodes) - gbn2 = DynamicGaussianNetwork(variables, 2, static_bn, transition_bn) + gbn2 = pbn.DynamicGaussianNetwork(variables, 2, static_bn, transition_bn) assert gbn2.markovian_order() == 2 assert gbn2.variables() == ["A", "B", "C", "D"] assert gbn2.num_variables() == 4 @@ -43,20 +38,20 @@ def test_create_dbn(): wrong_transition_bn = pbn.ConditionalDiscreteBN(transition_nodes, static_nodes) with pytest.raises(ValueError) as ex: - DynamicGaussianNetwork(variables, 2, static_bn, wrong_transition_bn) + pbn.DynamicGaussianNetwork(variables, 2, static_bn, wrong_transition_bn) assert "Static and transition Bayesian networks do not have the same type" in str( ex.value ) wrong_static_bn = pbn.DiscreteBN(static_nodes) with pytest.raises(ValueError) as ex: - DynamicGaussianNetwork(variables, 2, wrong_static_bn, wrong_transition_bn) + pbn.DynamicGaussianNetwork(variables, 2, wrong_static_bn, wrong_transition_bn) assert "Bayesian networks are not Gaussian." in str(ex.value) def test_variable_operations_dbn(): variables = ["A", "B", "C", "D"] - gbn = DynamicGaussianNetwork(variables, 2) + gbn = pbn.DynamicGaussianNetwork(variables, 2) assert gbn.markovian_order() == 2 assert gbn.variables() == ["A", "B", "C", "D"] @@ -91,7 +86,7 @@ def test_variable_operations_dbn(): def test_fit_dbn(): variables = ["A", "B", "C", "D"] - gbn = DynamicGaussianNetwork(variables, 2) + gbn = pbn.DynamicGaussianNetwork(variables, 2) assert not gbn.fitted() assert not gbn.static_bn().fitted() assert not gbn.transition_bn().fitted() @@ -99,7 +94,7 @@ def test_fit_dbn(): assert gbn.fitted() ddf = pbn.DynamicDataFrame(df, 2) - gbn2 = DynamicGaussianNetwork(variables, 2) + gbn2 = pbn.DynamicGaussianNetwork(variables, 2) gbn2.static_bn().fit(ddf.static_df()) assert not gbn2.fitted() assert gbn2.static_bn().fitted() @@ -172,13 +167,13 @@ def numpy_logl(dbn, test_data): def test_logl_dbn(): variables = ["A", "B", "C", "D"] - static_bn = GaussianNetwork( + static_bn = pbn.GaussianNetwork( ["A", "B", "C", "D"], [("A", "C"), ("B", "C"), ("C", "D")] ) - static_bn = GaussianNetwork( + static_bn = pbn.GaussianNetwork( ["A", "B", "C", "D"], [("A", "C"), ("B", "C"), ("C", "D")] ) - gbn = DynamicGaussianNetwork(variables, 2) + gbn = pbn.DynamicGaussianNetwork(variables, 2) static_bn = gbn.static_bn() static_bn.add_arc("A_t_2", "C_t_2") @@ -209,13 +204,13 @@ def test_logl_dbn(): def test_slogl_dbn(): variables = ["A", "B", "C", "D"] - static_bn = GaussianNetwork( + static_bn = pbn.GaussianNetwork( ["A", "B", "C", "D"], [("A", "C"), ("B", "C"), ("C", "D")] ) - static_bn = GaussianNetwork( + static_bn = pbn.GaussianNetwork( ["A", "B", "C", "D"], [("A", "C"), ("B", "C"), ("C", "D")] ) - gbn = DynamicGaussianNetwork(variables, 2) + gbn = pbn.DynamicGaussianNetwork(variables, 2) static_bn = gbn.static_bn() static_bn.add_arc("A_t_2", "C_t_2") diff --git a/tests/models/SemiparametricBN_test.py b/tests/models/SemiparametricBN_test.py index 7b9cc53b..713a348d 100644 --- 
a/tests/models/SemiparametricBN_test.py +++ b/tests/models/SemiparametricBN_test.py @@ -2,13 +2,12 @@ import pybnesian as pbn import pytest from helpers.data import DATA_SIZE, generate_normal_data -from pybnesian import CKDE, LinearGaussianCPD, SemiparametricBN df = generate_normal_data(DATA_SIZE) def test_create_spbn(): - spbn = SemiparametricBN(["A", "B", "C", "D"]) + spbn = pbn.SemiparametricBN(["A", "B", "C", "D"]) assert spbn.num_nodes() == 4 assert spbn.num_arcs() == 0 assert spbn.nodes() == ["A", "B", "C", "D"] @@ -16,7 +15,7 @@ def test_create_spbn(): for n in spbn.nodes(): assert spbn.node_type(n) == pbn.UnknownFactorType() - spbn = SemiparametricBN(["A", "B", "C", "D"], [("A", "C")]) + spbn = pbn.SemiparametricBN(["A", "B", "C", "D"], [("A", "C")]) assert spbn.num_nodes() == 4 assert spbn.num_arcs() == 1 assert spbn.nodes() == ["A", "B", "C", "D"] @@ -24,7 +23,7 @@ def test_create_spbn(): for n in spbn.nodes(): assert spbn.node_type(n) == pbn.UnknownFactorType() - spbn = SemiparametricBN([("A", "C"), ("B", "D"), ("C", "D")]) + spbn = pbn.SemiparametricBN([("A", "C"), ("B", "D"), ("C", "D")]) assert spbn.num_nodes() == 4 assert spbn.num_arcs() == 3 assert spbn.nodes() == ["A", "C", "B", "D"] @@ -33,19 +32,19 @@ def test_create_spbn(): assert spbn.node_type(n) == pbn.UnknownFactorType() with pytest.raises(TypeError) as ex: - spbn = SemiparametricBN(["A", "B", "C"], [("A", "C", "B")]) + spbn = pbn.SemiparametricBN(["A", "B", "C"], [("A", "C", "B")]) assert "incompatible constructor arguments" in str(ex.value) with pytest.raises(IndexError) as ex: - spbn = SemiparametricBN(["A", "B", "C"], [("A", "D")]) + spbn = pbn.SemiparametricBN(["A", "B", "C"], [("A", "D")]) assert "not present in the graph" in str(ex.value) with pytest.raises(ValueError) as ex: - spbn = SemiparametricBN([("A", "B"), ("B", "C"), ("C", "A")]) + spbn = pbn.SemiparametricBN([("A", "B"), ("B", "C"), ("C", "A")]) assert "must be a DAG" in str(ex.value) with pytest.raises(ValueError) as ex: - spbn = SemiparametricBN( + spbn = pbn.SemiparametricBN( ["A", "B", "C", "D"], [("A", "B"), ("B", "C"), ("C", "A")] ) assert "must be a DAG" in str(ex.value) @@ -57,7 +56,7 @@ def test_create_spbn(): "D": pbn.UnknownFactorType(), } - spbn = SemiparametricBN( + spbn = pbn.SemiparametricBN( ["A", "B", "C", "D"], [("A", pbn.CKDEType()), ("C", pbn.CKDEType())] ) assert spbn.num_nodes() == 4 @@ -67,7 +66,7 @@ def test_create_spbn(): for n in spbn.nodes(): assert spbn.node_type(n) == expected_node_type[n] - spbn = SemiparametricBN( + spbn = pbn.SemiparametricBN( ["A", "B", "C", "D"], [("A", "C")], [("A", pbn.CKDEType()), ("C", pbn.CKDEType())], @@ -79,7 +78,7 @@ def test_create_spbn(): for n in spbn.nodes(): assert spbn.node_type(n) == expected_node_type[n] - spbn = SemiparametricBN( + spbn = pbn.SemiparametricBN( [("A", "C"), ("B", "D"), ("C", "D")], [("A", pbn.CKDEType()), ("C", pbn.CKDEType())], ) @@ -91,7 +90,7 @@ def test_create_spbn(): assert spbn.node_type(n) == expected_node_type[n] with pytest.raises(TypeError) as ex: - spbn = SemiparametricBN( + spbn = pbn.SemiparametricBN( ["A", "B", "C"], [("A", "C", "B")], [("A", pbn.CKDEType()), ("C", pbn.CKDEType())], @@ -99,7 +98,7 @@ def test_create_spbn(): assert "incompatible constructor arguments" in str(ex.value) with pytest.raises(IndexError) as ex: - spbn = SemiparametricBN( + spbn = pbn.SemiparametricBN( ["A", "B", "C"], [("A", "D")], [("A", pbn.CKDEType()), ("C", pbn.CKDEType())], @@ -107,14 +106,14 @@ def test_create_spbn(): assert "not present in the graph" in str(ex.value) 
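    # Note: the optional third constructor argument fixes per-node factor types
    # (e.g. [("A", pbn.CKDEType())]); nodes without an explicit entry keep
    # pbn.UnknownFactorType() until the network is fitted.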
with pytest.raises(ValueError) as ex:
-        spbn = SemiparametricBN(
+        spbn = pbn.SemiparametricBN(
             [("A", "B"), ("B", "C"), ("C", "A")],
             [("A", pbn.CKDEType()), ("C", pbn.CKDEType())],
         )
     assert "must be a DAG" in str(ex.value)

     with pytest.raises(ValueError) as ex:
-        spbn = SemiparametricBN(
+        spbn = pbn.SemiparametricBN(
             ["A", "B", "C", "D"],
             [("A", "B"), ("B", "C"), ("C", "A")],
             [("A", pbn.CKDEType()), ("C", pbn.CKDEType())],
@@ -123,7 +122,7 @@ def test_create_spbn():


 def test_node_type():
-    spbn = SemiparametricBN(["A", "B", "C", "D"])
+    spbn = pbn.SemiparametricBN(["A", "B", "C", "D"])
     assert spbn.num_nodes() == 4
     assert spbn.num_arcs() == 0
     assert spbn.nodes() == ["A", "B", "C", "D"]
@@ -138,7 +137,7 @@ def test_node_type():


 def test_fit():
-    spbn = SemiparametricBN(
+    spbn = pbn.SemiparametricBN(
         [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")]
     )

@@ -153,7 +152,7 @@ def test_fit():
         cpd = spbn.cpd(n)

         assert cpd.type() == pbn.LinearGaussianCPDType()
-        assert type(cpd) == pbn.LinearGaussianCPD
+        assert type(cpd) == pbn.LinearGaussianCPD
         assert cpd.variable() == n
         assert set(cpd.evidence()) == set(spbn.parents(n))

@@ -162,12 +161,12 @@ def test_fit():
     spbn.remove_arc("A", "B")
     cpd_b = spbn.cpd("B")
-    assert type(cpd_b) == pbn.LinearGaussianCPD
+    assert type(cpd_b) == pbn.LinearGaussianCPD
     assert cpd_b.evidence != spbn.parents("B")

     spbn.fit(df)
     cpd_b = spbn.cpd("B")
-    assert type(cpd_b) == pbn.LinearGaussianCPD
+    assert type(cpd_b) == pbn.LinearGaussianCPD
     assert cpd_b.evidence() == spbn.parents("B")

     spbn.set_node_type("C", pbn.CKDEType())
@@ -182,7 +181,7 @@ def test_fit():


 def test_cpd():
-    spbn = SemiparametricBN(
+    spbn = pbn.SemiparametricBN(
         [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")],
         [("D", pbn.CKDEType())],
     )
@@ -205,21 +204,21 @@ def test_cpd():


 def test_add_cpds():
-    spbn = SemiparametricBN(
+    spbn = pbn.SemiparametricBN(
         [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")],
         [("D", pbn.CKDEType())],
     )

     assert spbn.node_type("A") == pbn.UnknownFactorType()
-    spbn.add_cpds([CKDE("A", [])])
+    spbn.add_cpds([pbn.CKDE("A", [])])
     assert spbn.node_type("A") == pbn.CKDEType()

     with pytest.raises(ValueError) as ex:
-        spbn.add_cpds([LinearGaussianCPD("D", ["A", "B", "C"])])
+        spbn.add_cpds([pbn.LinearGaussianCPD("D", ["A", "B", "C"])])
     assert "Bayesian network expects type" in str(ex.value)

-    lg = LinearGaussianCPD("B", ["A"], [2.5, 1.65], 4)
-    ckde = CKDE("D", ["A", "B", "C"])
+    lg = pbn.LinearGaussianCPD("B", ["A"], [2.5, 1.65], 4)
+    ckde = pbn.CKDE("D", ["A", "B", "C"])

     assert lg.fitted()
     assert not ckde.fitted()
@@ -246,7 +245,7 @@ def test_add_cpds():


 def test_logl():
-    spbn = SemiparametricBN(
+    spbn = pbn.SemiparametricBN(
         [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")]
     )

diff --git a/tests/serialization/serialize_factor_test.py b/tests/serialization/serialize_factor_test.py
index d6f17f0d..2e7b5481 100644
--- a/tests/serialization/serialize_factor_test.py
+++ b/tests/serialization/serialize_factor_test.py
@@ -4,30 +4,29 @@
 import pandas as pd
 import pybnesian as pbn
 import pytest
-from pybnesian import CKDE, DiscreteFactor, Factor, FactorType, LinearGaussianCPD


 @pytest.fixture
 def lg_bytes():
-    lg = LinearGaussianCPD("C", ["A", "B"])
+    lg = pbn.LinearGaussianCPD("C", ["A", "B"])
     return pickle.dumps(lg)


 @pytest.fixture
 def ckde_bytes():
-    ckde = CKDE("C", ["A", "B"])
+    ckde = pbn.CKDE("C", ["A", "B"])
     return pickle.dumps(ckde)


 @pytest.fixture
 def discrete_bytes():
-    discrete =
DiscreteFactor("C", ["A", "B"]) + discrete = pbn.DiscreteFactor("C", ["A", "B"]) return pickle.dumps(discrete) -class NewType(FactorType): +class NewType(pbn.FactorType): def __init__(self, factor_class): - FactorType.__init__(self) + pbn.FactorType.__init__(self) self.factor_class = factor_class def new_factor(self, model, variable, evidence): @@ -37,9 +36,9 @@ def __str__(self): return "NewType" -class NewFactor(Factor): +class NewFactor(pbn.Factor): def __init__(self, variable, evidence): - Factor.__init__(self, variable, evidence) + pbn.Factor.__init__(self, variable, evidence) self._fitted = False self.some_fit_data = None @@ -65,9 +64,9 @@ def __setstate_extra__(self, d): self.some_fit_data = d["some_fit_data"] -class NewFactorBis(Factor): +class NewFactorBis(pbn.Factor): def __init__(self, variable, evidence): - Factor.__init__(self, variable, evidence) + pbn.Factor.__init__(self, variable, evidence) self._fitted = False self.some_fit_data = None @@ -94,7 +93,7 @@ def __getstate__(self): return d def __setstate__(self, d): - Factor.__init__(self, d["variable"], d["evidence"]) + pbn.Factor.__init__(self, d["variable"], d["evidence"]) self._fitted = d["fitted"] self.some_fit_data = d["some_fit_data"] @@ -140,9 +139,7 @@ def test_serialization_unfitted_factor( nn = NewFactor("A", []) assert loaded_new.type() == nn.type() - from pybnesian import GaussianNetwork - - dummy_network = GaussianNetwork(["A", "B", "C", "D"]) + dummy_network = pbn.GaussianNetwork(["A", "B", "C", "D"]) assert type(loaded_new.type().new_factor(dummy_network, "A", [])) == NewFactor loaded_newbis = pickle.loads(newbis_bytes) @@ -165,7 +162,7 @@ def test_serialization_unfitted_factor( @pytest.fixture def lg_fitted_bytes(): - lg = LinearGaussianCPD("C", ["A", "B"], [1, 2, 3], 0.5) + lg = pbn.LinearGaussianCPD("C", ["A", "B"], [1, 2, 3], 0.5) return pickle.dumps(lg) @@ -175,14 +172,14 @@ def ckde_fitted_bytes(): data = pd.DataFrame( {"A": np.random.rand(10), "B": np.random.rand(10), "C": np.random.rand(10)} ).astype(float) - ckde = CKDE("C", ["A", "B"]) + ckde = pbn.CKDE("C", ["A", "B"]) ckde.fit(data) return pickle.dumps(ckde) @pytest.fixture def discrete_fitted_bytes(): - discrete = DiscreteFactor("C", ["A", "B"]) + discrete = pbn.DiscreteFactor("C", ["A", "B"]) data = pd.DataFrame( { diff --git a/tests/serialization/serialize_factor_type_test.py b/tests/serialization/serialize_factor_type_test.py index 12777d04..7377ceae 100644 --- a/tests/serialization/serialize_factor_type_test.py +++ b/tests/serialization/serialize_factor_type_test.py @@ -1,9 +1,7 @@ import pickle -import pytest - import pybnesian as pbn -from pybnesian import FactorType +import pytest @pytest.fixture @@ -24,14 +22,14 @@ def discrete_type_bytes(): return pickle.dumps(discrete) -class NewType(FactorType): +class NewType(pbn.FactorType): def __init__(self): - FactorType.__init__(self) + pbn.FactorType.__init__(self) -class OtherType(FactorType): +class OtherType(pbn.FactorType): def __init__(self): - FactorType.__init__(self) + pbn.FactorType.__init__(self) @pytest.fixture diff --git a/tests/serialization/serialize_models_test.py b/tests/serialization/serialize_models_test.py index f7c50949..12d39de5 100644 --- a/tests/serialization/serialize_models_test.py +++ b/tests/serialization/serialize_models_test.py @@ -4,47 +4,37 @@ import pybnesian as pbn import pytest from helpers.data import generate_discrete_data, generate_normal_data_independent -from pybnesian import ( - CKDE, - BayesianNetwork, - BayesianNetworkType, - ConditionalBayesianNetwork, - 
DiscreteBN, - DiscreteFactor, - GaussianNetwork, - KDENetwork, - LinearGaussianCPD, - SemiparametricBN, -) @pytest.fixture def gaussian_bytes(): - gaussian = GaussianNetwork(["A", "B", "C", "D"], [("A", "B")]) + gaussian = pbn.GaussianNetwork(["A", "B", "C", "D"], [("A", "B")]) return pickle.dumps(gaussian) @pytest.fixture def spbn_bytes(): - spbn = SemiparametricBN(["A", "B", "C", "D"], [("A", "B")], [("B", pbn.CKDEType())]) + spbn = pbn.SemiparametricBN( + ["A", "B", "C", "D"], [("A", "B")], [("B", pbn.CKDEType())] + ) return pickle.dumps(spbn) @pytest.fixture def kde_bytes(): - kde = KDENetwork(["A", "B", "C", "D"], [("A", "B")]) + kde = pbn.KDENetwork(["A", "B", "C", "D"], [("A", "B")]) return pickle.dumps(kde) @pytest.fixture def discrete_bytes(): - discrete = DiscreteBN(["A", "B", "C", "D"], [("A", "B")]) + discrete = pbn.DiscreteBN(["A", "B", "C", "D"], [("A", "B")]) return pickle.dumps(discrete) -class MyRestrictedGaussianNetworkType(BayesianNetworkType): +class MyRestrictedGaussianNetworkType(pbn.BayesianNetworkType): def __init__(self): - BayesianNetworkType.__init__(self) + pbn.BayesianNetworkType.__init__(self) def is_homogeneous(self): return True @@ -67,18 +57,20 @@ def __str__(self): @pytest.fixture def genericbn_bytes(): - gen = BayesianNetwork( + gen = pbn.BayesianNetwork( MyRestrictedGaussianNetworkType(), ["A", "B", "C", "D"], [("A", "B")] ) return pickle.dumps(gen) -class NewBN(BayesianNetwork): +class NewBN(pbn.BayesianNetwork): def __init__(self, variables, arcs=None): if arcs is None: - BayesianNetwork.__init__(self, MyRestrictedGaussianNetworkType(), variables) + pbn.BayesianNetwork.__init__( + self, MyRestrictedGaussianNetworkType(), variables + ) else: - BayesianNetwork.__init__( + pbn.BayesianNetwork.__init__( self, MyRestrictedGaussianNetworkType(), variables, arcs ) @@ -89,9 +81,9 @@ def newbn_bytes(): return pickle.dumps(new) -class NonHomogeneousType(BayesianNetworkType): +class NonHomogeneousType(pbn.BayesianNetworkType): def __init__(self): - BayesianNetworkType.__init__(self) + pbn.BayesianNetworkType.__init__(self) def is_homogeneous(self): return False @@ -112,20 +104,22 @@ def __str__(self): return "NonHomogeneousType" -class OtherBN(BayesianNetwork): +class OtherBN(pbn.BayesianNetwork): def __init__(self, variables, arcs=None, node_types=None): if arcs is None: if node_types is None: - BayesianNetwork.__init__(self, NonHomogeneousType(), variables) + pbn.BayesianNetwork.__init__(self, NonHomogeneousType(), variables) else: - BayesianNetwork.__init__( + pbn.BayesianNetwork.__init__( self, NonHomogeneousType(), variables, node_types ) else: if node_types is None: - BayesianNetwork.__init__(self, NonHomogeneousType(), variables, arcs) + pbn.BayesianNetwork.__init__( + self, NonHomogeneousType(), variables, arcs + ) else: - BayesianNetwork.__init__( + pbn.BayesianNetwork.__init__( self, NonHomogeneousType(), variables, arcs, node_types ) @@ -214,8 +208,8 @@ def test_serialization_bn_model( @pytest.fixture def gaussian_partial_fit_bytes(): - gaussian = GaussianNetwork(["A", "B", "C", "D"], [("A", "B")]) - lg = pbn.LinearGaussianCPD("B", ["A"], [1, 2], 2) + gaussian = pbn.GaussianNetwork(["A", "B", "C", "D"], [("A", "B")]) + lg = pbn.pbn.LinearGaussianCPD("B", ["A"], [1, 2], 2) gaussian.add_cpds([lg]) gaussian.include_cpd = True return pickle.dumps(gaussian) @@ -223,11 +217,11 @@ def gaussian_partial_fit_bytes(): @pytest.fixture def gaussian_fit_bytes(): - gaussian = GaussianNetwork(["A", "B", "C", "D"], [("A", "B")]) - lg_a = LinearGaussianCPD("A", [], [0], 
0.5) - lg_b = LinearGaussianCPD("B", ["A"], [1, 2], 2) - lg_c = LinearGaussianCPD("C", [], [2], 1) - lg_d = LinearGaussianCPD("D", [], [3], 1.5) + gaussian = pbn.GaussianNetwork(["A", "B", "C", "D"], [("A", "B")]) + lg_a = pbn.LinearGaussianCPD("A", [], [0], 0.5) + lg_b = pbn.LinearGaussianCPD("B", ["A"], [1, 2], 2) + lg_c = pbn.LinearGaussianCPD("C", [], [2], 1) + lg_d = pbn.LinearGaussianCPD("D", [], [3], 1.5) gaussian.add_cpds([lg_a, lg_b, lg_c, lg_d]) gaussian.include_cpd = True return pickle.dumps(gaussian) @@ -244,7 +238,7 @@ def other_partial_fit_bytes(): ("D", pbn.DiscreteFactorType()), ], ) - lg = LinearGaussianCPD("B", ["A"], [1, 2], 2) + lg = pbn.LinearGaussianCPD("B", ["A"], [1, 2], 2) other.add_cpds([lg]) other.include_cpd = True return pickle.dumps(other) @@ -261,15 +255,15 @@ def other_fit_bytes(): ("D", pbn.DiscreteFactorType()), ], ) - cpd_a = LinearGaussianCPD("A", [], [0], 0.5) - cpd_b = LinearGaussianCPD("B", ["A"], [1, 2], 2) + cpd_a = pbn.LinearGaussianCPD("A", [], [0], 0.5) + cpd_b = pbn.LinearGaussianCPD("B", ["A"], [1, 2], 2) df_continuous = generate_normal_data_independent(100) - cpd_c = CKDE("C", []) + cpd_c = pbn.CKDE("C", []) cpd_c.fit(df_continuous) df_discrete = generate_discrete_data(100) - cpd_d = DiscreteFactor("D", []) + cpd_d = pbn.DiscreteFactor("D", []) cpd_d.fit(df_discrete) other.add_cpds([cpd_a, cpd_b, cpd_c, cpd_d]) @@ -403,20 +397,20 @@ def cond_discrete_bytes(): @pytest.fixture def cond_genericbn_bytes(): - gen = ConditionalBayesianNetwork( + gen = pbn.ConditionalBayesianNetwork( MyRestrictedGaussianNetworkType(), ["C", "D"], ["A", "B"], [("A", "C")] ) return pickle.dumps(gen) -class ConditionalNewBN(ConditionalBayesianNetwork): +class ConditionalNewBN(pbn.ConditionalBayesianNetwork): def __init__(self, variables, interface, arcs=None): if arcs is None: - ConditionalBayesianNetwork.__init__( + pbn.ConditionalBayesianNetwork.__init__( self, MyRestrictedGaussianNetworkType(), variables, interface ) else: - ConditionalBayesianNetwork.__init__( + pbn.ConditionalBayesianNetwork.__init__( self, MyRestrictedGaussianNetworkType(), variables, interface, arcs ) @@ -427,24 +421,24 @@ def cond_newbn_bytes(): return pickle.dumps(new) -class ConditionalOtherBN(ConditionalBayesianNetwork): +class ConditionalOtherBN(pbn.ConditionalBayesianNetwork): def __init__(self, variables, interface, arcs=None, node_types=None): if arcs is None: if node_types is None: - ConditionalBayesianNetwork.__init__( + pbn.ConditionalBayesianNetwork.__init__( self, NonHomogeneousType(), variables, interface ) else: - ConditionalBayesianNetwork.__init__( + pbn.ConditionalBayesianNetwork.__init__( self, NonHomogeneousType(), variables, interface, node_types ) else: if node_types is None: - ConditionalBayesianNetwork.__init__( + pbn.ConditionalBayesianNetwork.__init__( self, NonHomogeneousType(), variables, interface, arcs ) else: - ConditionalBayesianNetwork.__init__( + pbn.ConditionalBayesianNetwork.__init__( self, NonHomogeneousType(), variables, interface, arcs, node_types ) @@ -540,7 +534,7 @@ def test_serialization_conditional_bn_model( @pytest.fixture def cond_gaussian_partial_fit_bytes(): gaussian = pbn.ConditionalGaussianNetwork(["C", "D"], ["A", "B"], [("A", "C")]) - lg = LinearGaussianCPD("C", ["A"], [1, 2], 2) + lg = pbn.LinearGaussianCPD("C", ["A"], [1, 2], 2) gaussian.add_cpds([lg]) gaussian.include_cpd = True return pickle.dumps(gaussian) @@ -549,8 +543,8 @@ def cond_gaussian_partial_fit_bytes(): @pytest.fixture def cond_gaussian_fit_bytes(): gaussian = 
pbn.ConditionalGaussianNetwork(["C", "D"], ["A", "B"], [("A", "C")]) - lg_c = LinearGaussianCPD("C", ["A"], [1, 2], 2) - lg_d = LinearGaussianCPD("D", [], [3], 1.5) + lg_c = pbn.LinearGaussianCPD("C", ["A"], [1, 2], 2) + lg_d = pbn.LinearGaussianCPD("D", [], [3], 1.5) gaussian.add_cpds([lg_c, lg_d]) gaussian.include_cpd = True return pickle.dumps(gaussian) @@ -564,7 +558,7 @@ def cond_other_partial_fit_bytes(): [("A", "C")], [("C", pbn.CKDEType()), ("D", pbn.LinearGaussianCPDType())], ) - lg = LinearGaussianCPD("D", [], [3], 1.5) + lg = pbn.LinearGaussianCPD("D", [], [3], 1.5) other.add_cpds([lg]) other.include_cpd = True return pickle.dumps(other) @@ -578,14 +572,14 @@ def cond_other_fit_bytes(): [("A", "C")], [("C", pbn.CKDEType()), ("D", pbn.DiscreteFactorType())], ) - cpd_c = CKDE("C", ["A"]) - cpd_d = DiscreteFactor("D", []) + cpd_c = pbn.CKDE("C", ["A"]) + cpd_d = pbn.DiscreteFactor("D", []) df_continuous = generate_normal_data_independent(100) cpd_c.fit(df_continuous) df_discrete = generate_discrete_data(100) - cpd_d = DiscreteFactor("D", []) + cpd_d = pbn.DiscreteFactor("D", []) cpd_d.fit(df_discrete) other.add_cpds([cpd_c, cpd_d]) @@ -822,9 +816,9 @@ def dyn_gaussian_partial_fit_bytes(): gaussian = pbn.DynamicGaussianNetwork(["A", "B", "C", "D"], 2) gaussian.static_bn().add_arc("A_t_2", "D_t_1") gaussian.transition_bn().add_arc("C_t_2", "B_t_0") - lg = LinearGaussianCPD("D_t_1", ["A_t_2"], [1, 2], 2) + lg = pbn.LinearGaussianCPD("D_t_1", ["A_t_2"], [1, 2], 2) gaussian.static_bn().add_cpds([lg]) - lg = LinearGaussianCPD("B_t_0", ["C_t_2"], [3, 4], 5) + lg = pbn.LinearGaussianCPD("B_t_0", ["C_t_2"], [3, 4], 5) gaussian.transition_bn().add_cpds([lg]) gaussian.include_cpd = True return pickle.dumps(gaussian) @@ -856,7 +850,7 @@ def dyn_other_partial_fit_bytes(): ("D_t_1", pbn.LinearGaussianCPDType()), ], ) - lg = LinearGaussianCPD("D_t_1", ["A_t_2"], [1, 2], 2) + lg = pbn.LinearGaussianCPD("D_t_1", ["A_t_2"], [1, 2], 2) other_static.add_cpds([lg]) other_transition = ConditionalOtherBN( @@ -869,7 +863,7 @@ def dyn_other_partial_fit_bytes(): ("D_t_0", pbn.LinearGaussianCPDType()), ], ) - lg = LinearGaussianCPD("D_t_0", ["A_t_2"], [3, 4], 1.5) + lg = pbn.LinearGaussianCPD("D_t_0", ["A_t_2"], [3, 4], 1.5) other_transition.add_cpds([lg]) assert other_static.type() == other_transition.type() @@ -895,7 +889,7 @@ def dyn_other_fit_bytes(): ("D_t_1", pbn.LinearGaussianCPDType()), ], ) - lg = LinearGaussianCPD("D_t_1", ["A_t_2"], [1, 2], 2) + lg = pbn.LinearGaussianCPD("D_t_1", ["A_t_2"], [1, 2], 2) other_static.add_cpds([lg]) other_transition = ConditionalOtherBN( @@ -908,7 +902,7 @@ def dyn_other_fit_bytes(): ("D_t_0", pbn.LinearGaussianCPDType()), ], ) - lg = LinearGaussianCPD("D_t_0", ["A_t_2"], [3, 4], 1.5) + lg = pbn.LinearGaussianCPD("D_t_0", ["A_t_2"], [3, 4], 1.5) other_transition.add_cpds([lg]) assert other_static.type() == other_transition.type() From d02e5553c4bacfaa9d454f7a5a7acbbc58999686 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Wed, 26 Mar 2025 14:08:28 +0000 Subject: [PATCH 62/75] typo fixes --- tests/factors/factor_type_test.py | 2 +- tests/models/SemiparametricBN_test.py | 6 +++--- tests/serialization/serialize_models_test.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/factors/factor_type_test.py b/tests/factors/factor_type_test.py index 54ca22ec..b3a4184f 100644 --- a/tests/factors/factor_type_test.py +++ b/tests/factors/factor_type_test.py @@ -91,7 +91,7 @@ def type(self): dummy_network = pbn.GaussianNetwork(["A", "B", "C", "D"]) with 
pytest.raises(RuntimeError) as ex: f1.type().new_factor(dummy_network, "D", ["A", "B", "C"]) - assert 'Tried to call pure virtual function "pbn.FactorType::new_factor"' in str( + assert 'Tried to call pure virtual function "FactorType::new_factor"' in str( ex.value ) diff --git a/tests/models/SemiparametricBN_test.py b/tests/models/SemiparametricBN_test.py index 713a348d..5d79219d 100644 --- a/tests/models/SemiparametricBN_test.py +++ b/tests/models/SemiparametricBN_test.py @@ -152,7 +152,7 @@ def test_fit(): cpd = spbn.cpd(n) assert cpd.type() == pbn.LinearGaussianCPDType() - assert type(cpd) == pbn.pbn.LinearGaussianCPD + assert type(cpd) == pbn.LinearGaussianCPD assert cpd.variable() == n assert set(cpd.evidence()) == set(spbn.parents(n)) @@ -161,12 +161,12 @@ def test_fit(): spbn.remove_arc("A", "B") cpd_b = spbn.cpd("B") - assert type(cpd_b) == pbn.pbn.LinearGaussianCPD + assert type(cpd_b) == pbn.LinearGaussianCPD assert cpd_b.evidence != spbn.parents("B") spbn.fit(df) cpd_b = spbn.cpd("B") - assert type(cpd_b) == pbn.pbn.LinearGaussianCPD + assert type(cpd_b) == pbn.LinearGaussianCPD assert cpd_b.evidence() == spbn.parents("B") spbn.set_node_type("C", pbn.CKDEType()) diff --git a/tests/serialization/serialize_models_test.py b/tests/serialization/serialize_models_test.py index 12d39de5..eb271e83 100644 --- a/tests/serialization/serialize_models_test.py +++ b/tests/serialization/serialize_models_test.py @@ -209,7 +209,7 @@ def test_serialization_bn_model( @pytest.fixture def gaussian_partial_fit_bytes(): gaussian = pbn.GaussianNetwork(["A", "B", "C", "D"], [("A", "B")]) - lg = pbn.pbn.LinearGaussianCPD("B", ["A"], [1, 2], 2) + lg = pbn.LinearGaussianCPD("B", ["A"], [1, 2], 2) gaussian.add_cpds([lg]) gaussian.include_cpd = True return pickle.dumps(gaussian) From 97c1d3c4eaf20b20403c70f90b345ebe2d2639f5 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Fri, 28 Mar 2025 07:39:11 +0000 Subject: [PATCH 63/75] Add function to generate discrete data classification with dependent variables --- tests/helpers/data.py | 78 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/tests/helpers/data.py b/tests/helpers/data.py index 900a4f9b..0464eab0 100644 --- a/tests/helpers/data.py +++ b/tests/helpers/data.py @@ -373,6 +373,84 @@ def generate_hybrid_data_independent(size: int, seed: int = SEED) -> pd.DataFram return df +def generate_discrete_data_classification(size: int, seed: int = SEED) -> pd.DataFrame: + """Generates a DataFrame of discrete data with dependent variables and a true label. + The relationships are as follows: + - TRUE_LABEL ~ Categorical(0.3, 0.4, 0.3) + - A ~ Categorical(0.6, 0.4) if TRUE_LABEL = class1, else Categorical(0.8, 0.2) if TRUE_LABEL = class2, else Categorical(0.5, 0.5) if TRUE_LABEL = class3 + - B ~ Categorical(0.5, 0.3, 0.2) if TRUE_LABEL = class1, else Categorical(0.2, 0.5, 0.3) if TRUE_LABEL = class2, else Categorical(0.3, 0.3, 0.4) if TRUE_LABEL = class3 + - C ~ Categorical(0.7, 0.3) if TRUE_LABEL = class1, else Categorical(0.4, 0.6) if TRUE_LABEL = class2, else Categorical(0.5, 0.5) if TRUE_LABEL = class3 + + Args: + size (int): The sample size. + seed (int, optional): The seed for random sampling. Defaults to 0. + + Returns: + pd.DataFrame: The DataFrame. 
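+
+    Example (illustrative sketch only; not executed by the test suite, and
+    assuming the module-level TRUE_LABEL constant):
+        >>> df = generate_discrete_data_classification(1000)
+        >>> df[TRUE_LABEL].value_counts(normalize=True)  # roughly 0.3/0.4/0.3 over class1..class3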
+ """ + np.random.seed(seed) + + class_dict = np.asarray(["class1", "class2", "class3"]) + class_values = class_dict[ + np.random.choice(class_dict.size, size, p=[0.3, 0.4, 0.3]) + ] + + a_dict = np.asarray(["A1", "A2"]) + b_dict = np.asarray(["B1", "B2", "B3"]) + c_dict = np.asarray(["C1", "C2"]) + + a_values = np.empty(size, dtype=object) + b_values = np.empty(size, dtype=object) + c_values = np.empty(size, dtype=object) + + # Indices + class1_indices = class_values == "class1" + class2_indices = class_values == "class2" + class3_indices = class_values == "class3" + + # Sampling + a_values[class1_indices] = a_dict[ + np.random.choice(a_dict.size, class1_indices.sum(), p=[0.6, 0.4]) + ] + a_values[class2_indices] = a_dict[ + np.random.choice(a_dict.size, class2_indices.sum(), p=[0.8, 0.2]) + ] + a_values[class3_indices] = a_dict[ + np.random.choice(a_dict.size, class3_indices.sum(), p=[0.5, 0.5]) + ] + + b_values[class1_indices] = b_dict[ + np.random.choice(b_dict.size, class1_indices.sum(), p=[0.5, 0.3, 0.2]) + ] + b_values[class2_indices] = b_dict[ + np.random.choice(b_dict.size, class2_indices.sum(), p=[0.2, 0.5, 0.3]) + ] + b_values[class3_indices] = b_dict[ + np.random.choice(b_dict.size, class3_indices.sum(), p=[0.3, 0.3, 0.4]) + ] + + c_values[class1_indices] = c_dict[ + np.random.choice(c_dict.size, class1_indices.sum(), p=[0.7, 0.3]) + ] + c_values[class2_indices] = c_dict[ + np.random.choice(c_dict.size, class2_indices.sum(), p=[0.4, 0.6]) + ] + c_values[class3_indices] = c_dict[ + np.random.choice(c_dict.size, class3_indices.sum(), p=[0.5, 0.5]) + ] + + # DataFrame + df = pd.DataFrame( + { + TRUE_LABEL: pd.Series(class_values, dtype="category"), + "A": pd.Series(a_values, dtype="category"), + "B": pd.Series(b_values, dtype="category"), + "C": pd.Series(c_values, dtype="category"), + } + ) + return df + + def generate_normal_data_classification(size: int, seed: int = SEED) -> pd.DataFrame: """Generates a DataFrame of normally distributed data with linear Gaussian relationships and a true label. 
The relationships are as follows: From a1213feb99194a41f69a15cde7347e59f789a75b Mon Sep 17 00:00:00 2001 From: JuanFPR-UPM Date: Fri, 28 Mar 2025 14:45:05 +0100 Subject: [PATCH 64/75] Hybrid MI numerical limits fix --- .../hybrid/mutual_information.cpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/pybnesian/learning/independences/hybrid/mutual_information.cpp b/pybnesian/learning/independences/hybrid/mutual_information.cpp index 9f69f0a4..72d0b29a 100644 --- a/pybnesian/learning/independences/hybrid/mutual_information.cpp +++ b/pybnesian/learning/independences/hybrid/mutual_information.cpp @@ -920,6 +920,7 @@ std::pair conditional_covariance( double entropy_mvn(int dimensionality, double cov_det) { auto d = static_cast(dimensionality); + return 0.5 * d + 0.5 * d * std::log(2 * util::pi) + 0.5 * std::log(cov_det); } @@ -951,7 +952,7 @@ double MutualInformation::mi_discrete(const std::string& x, const std::string& y } } - return mi; + return std::max(mi, util::machine_tol); } template @@ -1020,6 +1021,7 @@ double MutualInformation::mi_mixed_impl(const std::string& discrete, const std:: // Add H(Y_C) double mi = 0.5 + 0.5 * std::log(2 * util::pi * total_variance); + for (auto j = 0; j < num_categories; ++j) { if (counts(j) > 0) { auto pj = static_cast(counts(j)) / total_counts; @@ -1056,9 +1058,10 @@ template double MutualInformation::mi_continuous_impl(const std::string& x, const std::string& y) const { auto pcov = m_df.cov(x, y); auto& cov = *pcov; - auto cor = cov(0, 1) / sqrt(cov(0, 0) * cov(1, 1)); - return -0.5 * std::log(1 - cor * cor); + + auto mi = -0.5 * std::log(1 - cor * cor); + return std::max(mi, util::machine_tol); } double MutualInformation::mi_continuous(const std::string& x, const std::string& y) const { @@ -1124,12 +1127,14 @@ double MutualInformation::calculate_df(const std::string& x, const std::string& double MutualInformation::pvalue(const std::string& x, const std::string& y) const { auto mi_value = mi(x, y); + // Multiply by 2*N to obtain 2*N*MI(X; Y). This follows a X^2 distribution. mi_value *= 2 * m_df.valid_rows(x, y); if (std::isinf(mi_value) || std::isnan(mi_value)) { return 1; } + auto df = calculate_df(x, y); boost::math::chi_squared_distribution chidist(static_cast(df)); @@ -1254,7 +1259,8 @@ double MutualInformation::cmi_discrete_continuous_impl(const std::string& x, double pi = static_cast(ni) / total_counts; double pj = static_cast(nj) / total_counts; - auto h_xy = 0.5 + 0.5 * std::log(2 * util::pi * variance_xy(k)); + double h_xy = 0.5 + 0.5 * std::log(2 * util::pi * variance_xy(k)); + mi += pij * (-h_xy + std::log(pij / (pi * pj))); } } @@ -1386,6 +1392,7 @@ double MutualInformation::pvalue(const std::string& x, const std::string& y, con if (std::isinf(mi_value) || std::isnan(mi_value)) { return 1; } + auto df = calculate_df(x, y, z); boost::math::chi_squared_distribution chidist(static_cast(df)); @@ -1450,7 +1457,7 @@ double MutualInformation::cmi_discrete_discrete(const std::string& x, } // mi contains N*MI(X; Y). 
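+    // Clamp to the machine tolerance: cancellation in the sums above can leave a
+    // tiny negative estimate, even though mutual information is non-negative in theory.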
- return mi; + return std::max(mi, util::machine_tol);; } double MutualInformation::cmi_general_both_discrete(const std::string& x, @@ -1756,6 +1763,7 @@ double MutualInformation::pvalue(const std::string& x, const std::string& y, con if (std::isinf(mi_value) || std::isnan(mi_value)) { return 1; } + auto df = calculate_df(x, y, discrete_z, continuous_z); boost::math::chi_squared_distribution chidist(static_cast(df)); From 498175d84e97a38ef7c3aa6632922997ae673890 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Tue, 1 Apr 2025 08:15:59 +0000 Subject: [PATCH 65/75] removed extra semicolon --- pybnesian/learning/independences/hybrid/mutual_information.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pybnesian/learning/independences/hybrid/mutual_information.cpp b/pybnesian/learning/independences/hybrid/mutual_information.cpp index 72d0b29a..029da9e9 100644 --- a/pybnesian/learning/independences/hybrid/mutual_information.cpp +++ b/pybnesian/learning/independences/hybrid/mutual_information.cpp @@ -1457,7 +1457,7 @@ double MutualInformation::cmi_discrete_discrete(const std::string& x, } // mi contains N*MI(X; Y). - return std::max(mi, util::machine_tol);; + return std::max(mi, util::machine_tol); } double MutualInformation::cmi_general_both_discrete(const std::string& x, From ae5944fab1c8be3061c8ea51e65c7ded40cc9a72 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Tue, 1 Apr 2025 10:07:41 +0000 Subject: [PATCH 66/75] Enable ccache for faster compilation if available --- CMakeLists.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 85552886..93819cc4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,6 +14,16 @@ if(UNIX) set(CMAKE_CXX_COMPILER "g++") endif() +# Enable ccache if available +find_program(CCACHE_PROGRAM ccache) +if(CCACHE_PROGRAM) + message(STATUS "ccache found: ${CCACHE_PROGRAM}") + set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") + set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") +else() + message(STATUS "ccache not found. Compilation will proceed without caching.") +endif() + find_package(Git REQUIRED) message("Git executable: ${GIT_EXECUTABLE}") From 631964ea8ec1a1c02d5cdbe1ac0e49a610b34406 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Tue, 1 Apr 2025 10:08:43 +0000 Subject: [PATCH 67/75] Add Carlos Li Hu as a co-author in documentation and project metadata --- docs/source/conf.py | 2 +- pyproject.toml | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 7cc18217..cf13de10 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -19,7 +19,7 @@ project = "PyBNesian" copyright = "2024, David Atienza" -author = "David Atienza" +author = "David Atienza, Carlos Li Hu" # The full version, including alpha/beta/rc tags version = "0.5.1" diff --git a/pyproject.toml b/pyproject.toml index 8d601d67..6846b4a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,10 @@ sdist.exclude = ["vcpkg/*", "docs/"] [project] name = "pybnesian" -authors = [{ name = "David Atienza", email = "datienza@fi.upm.es" }] +authors = [ + { name = "David Atienza", email = "datienza@fi.upm.es" }, + { name = "Carlos Li Hu", email = "carloslihu96@gmail.com" }, +] description = "PyBNesian is a Python package that implements Bayesian networks." 
version = "0.5.1" readme = { file = "README.md", content-type = "text/markdown" } From bdf2685133f06e5f905a65f79d53255367284f69 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Thu, 10 Apr 2025 07:14:08 +0000 Subject: [PATCH 68/75] Rename TRUE_LABEL to TRUE_CLASS_LABEL for consistency in data generation functions --- tests/helpers/data.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/helpers/data.py b/tests/helpers/data.py index 0464eab0..8833fd92 100644 --- a/tests/helpers/data.py +++ b/tests/helpers/data.py @@ -2,7 +2,7 @@ import pandas as pd # Constants -TRUE_LABEL = "attack_label" +TRUE_CLASS_LABEL = "attack_label" SUPER_PARENT = "A" DATA_SIZE = 10000 SAMPLE_SIZE = 100 @@ -376,10 +376,10 @@ def generate_hybrid_data_independent(size: int, seed: int = SEED) -> pd.DataFram def generate_discrete_data_classification(size: int, seed: int = SEED) -> pd.DataFrame: """Generates a DataFrame of discrete data with dependent variables and a true label. The relationships are as follows: - - TRUE_LABEL ~ Categorical(0.3, 0.4, 0.3) - - A ~ Categorical(0.6, 0.4) if TRUE_LABEL = class1, else Categorical(0.8, 0.2) if TRUE_LABEL = class2, else Categorical(0.5, 0.5) if TRUE_LABEL = class3 - - B ~ Categorical(0.5, 0.3, 0.2) if TRUE_LABEL = class1, else Categorical(0.2, 0.5, 0.3) if TRUE_LABEL = class2, else Categorical(0.3, 0.3, 0.4) if TRUE_LABEL = class3 - - C ~ Categorical(0.7, 0.3) if TRUE_LABEL = class1, else Categorical(0.4, 0.6) if TRUE_LABEL = class2, else Categorical(0.5, 0.5) if TRUE_LABEL = class3 + - TRUE_CLASS_LABEL ~ Categorical(0.3, 0.4, 0.3) + - A ~ Categorical(0.6, 0.4) if TRUE_CLASS_LABEL = class1, else Categorical(0.8, 0.2) if TRUE_CLASS_LABEL = class2, else Categorical(0.5, 0.5) if TRUE_CLASS_LABEL = class3 + - B ~ Categorical(0.5, 0.3, 0.2) if TRUE_CLASS_LABEL = class1, else Categorical(0.2, 0.5, 0.3) if TRUE_CLASS_LABEL = class2, else Categorical(0.3, 0.3, 0.4) if TRUE_CLASS_LABEL = class3 + - C ~ Categorical(0.7, 0.3) if TRUE_CLASS_LABEL = class1, else Categorical(0.4, 0.6) if TRUE_CLASS_LABEL = class2, else Categorical(0.5, 0.5) if TRUE_CLASS_LABEL = class3 Args: size (int): The sample size. @@ -442,7 +442,7 @@ def generate_discrete_data_classification(size: int, seed: int = SEED) -> pd.Dat # DataFrame df = pd.DataFrame( { - TRUE_LABEL: pd.Series(class_values, dtype="category"), + TRUE_CLASS_LABEL: pd.Series(class_values, dtype="category"), "A": pd.Series(a_values, dtype="category"), "B": pd.Series(b_values, dtype="category"), "C": pd.Series(c_values, dtype="category"), @@ -454,7 +454,7 @@ def generate_discrete_data_classification(size: int, seed: int = SEED) -> pd.Dat def generate_normal_data_classification(size: int, seed: int = SEED) -> pd.DataFrame: """Generates a DataFrame of normally distributed data with linear Gaussian relationships and a true label. 
The relationships are as follows: - - TRUE_LABEL ~ Categorical(0.3, 0.4, 0.3) + - TRUE_CLASS_LABEL ~ Categorical(0.3, 0.4, 0.3) - A ~ N(-4.2, 0.75) - B ~ N(0, 0.25) if class = class1, else N(1, 0.5) if class = class2, else N(2, 1) if class = class3 - C ~ N(-2 + 2 * B, 1) if class = class1, else N(1 + 0.5 * B, 0.5) if class = class2, else N(3 + 3 * B, 0.25) if class = class3 @@ -508,7 +508,7 @@ def generate_normal_data_classification(size: int, seed: int = SEED) -> pd.DataF # DataFrame df = pd.DataFrame( { - TRUE_LABEL: pd.Series(class_values, dtype="category"), + TRUE_CLASS_LABEL: pd.Series(class_values, dtype="category"), "A": a_values, "B": b_values, "C": c_values, @@ -522,7 +522,7 @@ def generate_non_normal_data_classification( ) -> pd.DataFrame: """Generates a DataFrame of uniformly distributed data with non-linear relationships and a true label. The relationships are as follows: - - TRUE_LABEL ~ Categorical(0.3, 0.4, 0.3) + - TRUE_CLASS_LABEL ~ Categorical(0.3, 0.4, 0.3) - A ~ U(0, 10) - B ~ U(5, 15) if class = class1, else U(10, 20) if class = class2, else U(15, 25) if class = class3 - C ~ sin(A) + cos(B) + U(-1, 1) if class = class1, else exp(A / 10) + log(B + 1) + U(-0.5, 0.5) if class = class2, else A * B + U(-2, 2) if class = class3 @@ -573,7 +573,7 @@ def generate_non_normal_data_classification( # DataFrame df = pd.DataFrame( { - TRUE_LABEL: pd.Series(class_values, dtype="category"), + TRUE_CLASS_LABEL: pd.Series(class_values, dtype="category"), "A": a_values, "B": b_values, "C": c_values, From 64c6a1e2ed7040239974b7dfa246329c678f7df1 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Thu, 8 May 2025 07:08:16 +0000 Subject: [PATCH 69/75] fix: generate_discrete_data --- tests/helpers/data.py | 106 ++++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 56 deletions(-) diff --git a/tests/helpers/data.py b/tests/helpers/data.py index 8833fd92..b0c2a927 100644 --- a/tests/helpers/data.py +++ b/tests/helpers/data.py @@ -116,9 +116,9 @@ def generate_discrete_data(size: int, seed: int = SEED) -> pd.DataFrame: """Generates a DataFrame of discrete data with dependent variables. The relationships are as follows: - A ~ Categorical(0.75, 0.25) - - B ~ Categorical(0.33, 0.33, 0.34) if A = a1, else Categorical(0, 0.8, 0.2) - - C ~ Categorical(0.5, 0.5) if A = a1 and B = b1, else Categorical(0.75, 0.25) if A = a1 and B = b2, else Categorical(0.2, 0.8) if A = a1 and B = b3, else Categorical(1, 0) if A = a2 and B = b1, else Categorical(0, 1) if A = a2 and B = b2, else Categorical(0.01, 0.99) if A = a2 and B = b3 - - D ~ Categorical(0.25, 0.25, 0.25, 0.25) if C = c1, else Categorical(0.7, 0, 0.15, 0.15) if C = c2 + - B ~ Categorical(0.33, 0.33, 0.34) if A = A1, else Categorical(0, 0.8, 0.2) + - C ~ Categorical(0.5, 0.5) if A = A1 and B = B1, else Categorical(0.75, 0.25) if A = A1 and B = B2, else Categorical(0.2, 0.8) if A = A1 and B = B3, else Categorical(1, 0) if A = A2 and B = B1, else Categorical(0, 1) if A = A2 and B = B2, else Categorical(0.01, 0.99) if A = A2 and B = B3 + - D ~ Categorical(0.25, 0.25, 0.25, 0.25) if C = C1, else Categorical(0.7, 0, 0.15, 0.15) if C = C2 Args: size (int): The sample size. 
@@ -136,60 +136,54 @@ def generate_discrete_data(size: int, seed: int = SEED) -> pd.DataFrame: d_dict = np.asarray(["D1", "D2", "D3", "D4"]) a_values = a_dict[np.random.choice(a_dict.size, size, p=[0.75, 0.25])] - b_values = np.empty_like(a_values) - c_values = np.empty_like(a_values) - d_values = np.empty_like(a_values) + b_values = np.empty(size, dtype=object) + c_values = np.empty(size, dtype=object) + d_values = np.empty(size, dtype=object) # Indices a1_indices = a_values == "A1" + a2_indices = a_values == "A2" - a1b1_indices = np.logical_and(a_values == "A1", b_values == "B1") - a1b2_indices = np.logical_and(a_values == "A1", b_values == "B2") - a1b3_indices = np.logical_and(a_values == "A1", b_values == "B3") - a2b1_indices = np.logical_and(a_values == "A2", b_values == "B1") - a2b2_indices = np.logical_and(a_values == "A2", b_values == "B2") - a2b3_indices = np.logical_and(a_values == "A2", b_values == "B3") - - c1_indices = c_values == "C1" - c2_indices = c_values == "C2" - - # Sampling + # Sampling B b_values[a1_indices] = b_dict[ - np.random.choice(b_dict.size, np.sum(a1_indices), p=[0.33, 0.33, 0.34]) + np.random.choice(b_dict.size, a1_indices.sum(), p=[0.33, 0.33, 0.34]) ] - b_values[~a1_indices] = b_dict[ - np.random.choice(b_dict.size, np.sum(~a1_indices), p=[0, 0.8, 0.2]) + b_values[a2_indices] = b_dict[ + np.random.choice(b_dict.size, a2_indices.sum(), p=[0, 0.8, 0.2]) ] - c_values[a1b1_indices] = c_dict[ - np.random.choice(c_dict.size, np.sum(a1b1_indices), p=[0.5, 0.5]) - ] - c_values[a1b2_indices] = c_dict[ - np.random.choice(c_dict.size, np.sum(a1b2_indices), p=[0.75, 0.25]) - ] - c_values[a1b3_indices] = c_dict[ - np.random.choice(c_dict.size, np.sum(a1b3_indices), p=[0.2, 0.8]) - ] - c_values[a2b1_indices] = c_dict[ - np.random.choice(c_dict.size, np.sum(a2b1_indices), p=[1, 0]) - ] - c_values[a2b2_indices] = c_dict[ - np.random.choice(c_dict.size, np.sum(a2b2_indices), p=[0, 1]) - ] - c_values[a2b3_indices] = c_dict[ - np.random.choice(c_dict.size, np.sum(a2b3_indices), p=[0.01, 0.99]) - ] - - d_values[c1_indices] = d_dict[ - np.random.choice(d_dict.size, np.sum(c1_indices), p=[0.25, 0.25, 0.25, 0.25]) - ] - d_values[c2_indices] = d_dict[ - np.random.choice(d_dict.size, np.sum(c2_indices), p=[0.7, 0, 0.15, 0.15]) - ] + # Sampling C + for i in range(size): + if a_values[i] == "A1" and b_values[i] == "B1": + c_values[i] = c_dict[np.random.choice(c_dict.size, p=[0.5, 0.5])] + elif a_values[i] == "A1" and b_values[i] == "B2": + c_values[i] = c_dict[np.random.choice(c_dict.size, p=[0.75, 0.25])] + elif a_values[i] == "A1" and b_values[i] == "B3": + c_values[i] = c_dict[np.random.choice(c_dict.size, p=[0.2, 0.8])] + elif a_values[i] == "A2" and b_values[i] == "B1": + c_values[i] = c_dict[np.random.choice(c_dict.size, p=[1, 0])] + elif a_values[i] == "A2" and b_values[i] == "B2": + c_values[i] = c_dict[np.random.choice(c_dict.size, p=[0, 1])] + elif a_values[i] == "A2" and b_values[i] == "B3": + c_values[i] = c_dict[np.random.choice(c_dict.size, p=[0.01, 0.99])] + + # Sampling D + for i in range(size): + if c_values[i] == "C1": + d_values[i] = d_dict[ + np.random.choice(d_dict.size, p=[0.25, 0.25, 0.25, 0.25]) + ] + elif c_values[i] == "C2": + d_values[i] = d_dict[np.random.choice(d_dict.size, p=[0.7, 0, 0.15, 0.15])] # DataFrame df = pd.DataFrame( - {"A": a_values, "B": b_values, "C": c_values, "D": d_values}, dtype="category" + { + "A": pd.Series(a_values, dtype="category"), + "B": pd.Series(b_values, dtype="category"), + "C": pd.Series(c_values, dtype="category"), + "D": 
pd.Series(d_values, dtype="category"), + } ) return df @@ -197,10 +191,10 @@ def generate_discrete_data(size: int, seed: int = SEED) -> pd.DataFrame: def generate_discrete_data_independent(size: int, seed: int = SEED) -> pd.DataFrame: """Generates a DataFrame of discrete data with uniform distributions. The relationships are as follows: - - A ~ Categorical(a1, a2) - - B ~ Categorical(b1, b2, b3) - - C ~ Categorical(c1, c2) - - D ~ Categorical(d1, d2, d3, d4) + - A ~ Categorical(A1, A2) + - B ~ Categorical(B1, B2, B3) + - C ~ Categorical(C1, C2) + - D ~ Categorical(D1, D2, D3, D4) Args: size (int): The sample size. @@ -234,9 +228,9 @@ def generate_hybrid_data(size: int, seed: int = SEED) -> pd.DataFrame: """Generates a DataFrame of hybrid data with discrete and continuous variables. The relationships are as follows: - A ~ Categorical(0.75, 0.25) - - B ~ Categorical(0.3, 0.4, 0.3) if A = a1, else Categorical(0.2, 0.5, 0.3) + - B ~ Categorical(0.3, 0.4, 0.3) if A = A1, else Categorical(0.2, 0.5, 0.3) - C ~ N(-4.2, 0.75) - - D ~ N(1, 0.75) if A = a1 and B = b1, else N(-2 + C, 2) if A = a1 and B = b2, else N(-1 + 3 * C, 0.25) if A = a1 and B = b3, else N(2, 1) if A = a2 and B = b1, else N(3.5 - 1.2 * C, 1) if A = a2 and B = b2, else N(4.8 - 2 * C, 1.5) if A = a2 and B = b3 + - D ~ N(1, 0.75) if A = A1 and B = B1, else N(-2 + C, 2) if A = A1 and B = B2, else N(-1 + 3 * C, 0.25) if A = A1 and B = B3, else N(2, 1) if A = A2 and B = B1, else N(3.5 - 1.2 * C, 1) if A = A2 and B = B2, else N(4.8 - 2 * C, 1.5) if A = A2 and B = B3 Args: size (int): The sample size. @@ -456,8 +450,8 @@ def generate_normal_data_classification(size: int, seed: int = SEED) -> pd.DataF The relationships are as follows: - TRUE_CLASS_LABEL ~ Categorical(0.3, 0.4, 0.3) - A ~ N(-4.2, 0.75) - - B ~ N(0, 0.25) if class = class1, else N(1, 0.5) if class = class2, else N(2, 1) if class = class3 - - C ~ N(-2 + 2 * B, 1) if class = class1, else N(1 + 0.5 * B, 0.5) if class = class2, else N(3 + 3 * B, 0.25) if class = class3 + - B ~ N(0, 0.25) if TRUE_CLASS_LABEL = class1, else N(1, 0.5) if TRUE_CLASS_LABEL = class2, else N(2, 1) if TRUE_CLASS_LABEL = class3 + - C ~ N(-2 + 2 * B, 1) if TRUE_CLASS_LABEL = class1, else N(1 + 0.5 * B, 0.5) if TRUE_CLASS_LABEL = class2, else N(3 + 3 * B, 0.25) if TRUE_CLASS_LABEL = class3 size (int): The sample seed (int, optional): The seed for random sampling. Defaults to 0. @@ -524,8 +518,8 @@ def generate_non_normal_data_classification( The relationships are as follows: - TRUE_CLASS_LABEL ~ Categorical(0.3, 0.4, 0.3) - A ~ U(0, 10) - - B ~ U(5, 15) if class = class1, else U(10, 20) if class = class2, else U(15, 25) if class = class3 - - C ~ sin(A) + cos(B) + U(-1, 1) if class = class1, else exp(A / 10) + log(B + 1) + U(-0.5, 0.5) if class = class2, else A * B + U(-2, 2) if class = class3 + - B ~ U(5, 15) if TRUE_CLASS_LABEL = class1, else U(10, 20) if TRUE_CLASS_LABEL = class2, else U(15, 25) if TRUE_CLASS_LABEL = class3 + - C ~ sin(A) + cos(B) + U(-1, 1) if TRUE_CLASS_LABEL = class1, else exp(A / 10) + log(B + 1) + U(-0.5, 0.5) if TRUE_CLASS_LABEL = class2, else A * B + U(-2, 2) if TRUE_CLASS_LABEL = class3 Args: size (int): The sample size. 
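
The fix above addresses a masking-order bug: the earlier vectorized sampler built the
joint parent masks (e.g. `a1b1_indices`) before `b_values` and `c_values` had been
sampled, so `C` and `D` were masked against uninitialized arrays rather than their
intended conditionals. The per-row loops are correct but scale linearly in Python; a
vectorized alternative that still samples parents first is sketched below
(`sample_conditional` is a hypothetical helper, assuming only NumPy; it is not part of
the test suite):

```
import numpy as np

def sample_conditional(rng, parent_values, dist_table, categories):
    # Draw one categorical child per row, conditioned on the *already sampled*
    # parent configuration of that row; masks are built after the parents exist.
    out = np.empty(parent_values.shape[0], dtype=object)
    categories = np.asarray(categories)
    for config, probs in dist_table.items():
        mask = parent_values == config
        out[mask] = categories[rng.choice(categories.size, mask.sum(), p=probs)]
    return out

rng = np.random.default_rng(0)
a_values = np.where(rng.random(1000) < 0.75, "A1", "A2")
b_values = sample_conditional(
    rng, a_values,
    {"A1": [0.33, 0.33, 0.34], "A2": [0.0, 0.8, 0.2]},
    ["B1", "B2", "B3"],
)
```

Multi-parent children such as `C` can reuse the same pattern by masking on the joint
configuration, e.g. `(a_values == "A1") & (b_values == "B1")`.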
From 696dfa4c1e45e57eab26d91df3d11f49fd54a476 Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Fri, 23 May 2025 10:19:22 +0000 Subject: [PATCH 70/75] chore: update version to 0.5.2 in configuration files --- docs/source/conf.py | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index cf13de10..418e188b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -22,8 +22,8 @@ author = "David Atienza, Carlos Li Hu" # The full version, including alpha/beta/rc tags -version = "0.5.1" -release = "0.5.1" +version = "0.5.2" +release = "0.5.2" # -- General configuration --------------------------------------------------- diff --git a/pyproject.toml b/pyproject.toml index 6846b4a4..4d4b6f57 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ authors = [ { name = "Carlos Li Hu", email = "carloslihu96@gmail.com" }, ] description = "PyBNesian is a Python package that implements Bayesian networks." -version = "0.5.1" +version = "0.5.2" readme = { file = "README.md", content-type = "text/markdown" } license = { file = "LICENSE" } requires-python = ">=3.8" From 788d922edca4df248fe1980a47b3c131ea31c90e Mon Sep 17 00:00:00 2001 From: Carlos Li Hu Date: Tue, 27 May 2025 06:34:13 +0000 Subject: [PATCH 71/75] refactor: copilot documentation for KDE_test.py --- tests/factors/continuous/KDE_test.py | 87 +++++++++++++++++++++++++++- 1 file changed, 86 insertions(+), 1 deletion(-) diff --git a/tests/factors/continuous/KDE_test.py b/tests/factors/continuous/KDE_test.py index dec89299..3d50ab91 100644 --- a/tests/factors/continuous/KDE_test.py +++ b/tests/factors/continuous/KDE_test.py @@ -10,7 +10,13 @@ df_float = df.astype("float32") -def test_check_type(): +def test_check_type() -> None: + """ + Tests that the KDE factor raises a ValueError when the data type of the test dataset + is different from the data type of the training dataset during log-likelihood and + smoothed log-likelihood computations. + """ + cpd = pbn.KDE(["A"]) cpd.fit(df) with pytest.raises(ValueError) as ex: @@ -30,12 +36,28 @@ def test_check_type(): def test_kde_variables(): + """ + Tests the initialization of the KDE class with different sets of variables. + For each list of variable names, this test creates a KDE object and asserts + that the object's variables match the input list. This ensures that the KDE + class correctly stores and returns its variables upon initialization. + """ + for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]: cpd = pbn.KDE(variables) assert cpd.variables() == variables def test_kde_bandwidth(): + """ + Tests the bandwidth selection and assignment functionality of the KDE class. + This test verifies: + - That the KDE bandwidth computed using the normal reference rule matches the output of scipy's gaussian_kde with a custom bandwidth method, for various variable sets and sample sizes. + - That the KDE bandwidth computed using Scott's rule matches the output of scipy's gaussian_kde default bandwidth, for various variable sets and sample sizes. + - That the bandwidth attribute of the KDE object can be manually set and correctly reflects the assigned value. + The test is performed for both integer and float dataframes. 
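+
+    A minimal sketch of the Scott's-rule comparison (assuming PyBNesian's bandwidth
+    matrix plays the role of scipy's kernel covariance; scipy expects data of shape (d, N)):
+        from scipy.stats import gaussian_kde
+        scipy_kde = gaussian_kde(npdata.T)  # Scott's rule by default
+        assert np.allclose(cpd.bandwidth, scipy_kde.covariance)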
+ """ + for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]: for instances in [50, 1000, 10000]: npdata = df.loc[:, variables].to_numpy() @@ -81,6 +103,28 @@ def test_kde_bandwidth(): class UnitaryBandwidth(pbn.BandwidthSelector): + """ + A bandwidth selector that returns the identity matrix as the bandwidth. + This class is a subclass of `pbn.BandwidthSelector` and implements a simple bandwidth selection strategy + where the bandwidth matrix is always the identity matrix of size equal to the number of variables. + Methods + ------- + __init__(): + Initializes the UnitaryBandwidth selector. + bandwidth(df, variables): + Returns the identity matrix of shape (len(variables), len(variables)) as the bandwidth matrix. + Parameters + ---------- + df : pandas.DataFrame + The data frame containing the data (not used in this selector). + variables : list + The list of variables for which the bandwidth is to be computed. + Returns + ------- + numpy.ndarray + An identity matrix of size equal to the number of variables. + """ + def __init__(self): pbn.BandwidthSelector.__init__(self) @@ -89,6 +133,16 @@ def bandwidth(self, df, variables): def test_kde_new_bandwidth(): + """ + Tests the behavior of the KDE class when using the UnitaryBandwidth bandwidth selector. + This test verifies that: + - When fitting a KDE with a single variable, the resulting bandwidth matrix is the 1x1 identity matrix. + - When fitting a KDE with four variables, the resulting bandwidth matrix is the 4x4 identity matrix. + - The behavior is consistent for both integer and float dataframes. + Assertions: + - The bandwidth matrix after fitting is as expected (identity matrix) for both data types and variable counts. + """ + kde = pbn.KDE(["A"], UnitaryBandwidth()) kde.fit(df) assert kde.bandwidth == np.eye(1) @@ -105,6 +159,14 @@ def test_kde_new_bandwidth(): def test_kde_data_type(): + """ + Tests the `data_type` method of the KDE factor. + This test verifies that: + - Calling `data_type` before fitting the KDE raises a ValueError with the message "KDE factor not fitted". + - After fitting the KDE with a DataFrame `df`, the returned data type is `pa.float64()`. + - After fitting the KDE with a DataFrame `df_float`, the returned data type is `pa.float32()`. + """ + k = pbn.KDE(["A"]) with pytest.raises(ValueError) as ex: @@ -118,6 +180,19 @@ def test_kde_data_type(): def test_kde_fit(): + """ + Tests the fitting process of the KDE (Kernel Density Estimation) class in the PyBNesian library. + This test verifies that: + - The KDE object is not fitted before calling `fit`. + - After fitting with a subset of the provided DataFrame, the KDE object is marked as fitted. + - The number of training instances and variables in the fitted KDE matches those of a reference `scipy.stats.gaussian_kde` object. + - The test is performed for different combinations of variables and different numbers of training instances, using both integer and float DataFrames. + Tested scenarios: + - Single and multiple variable KDEs. + - Different sample sizes (50, 150, 500). + - Both integer and float data types. + """ + def _test_kde_fit_iter(variables, _df, instances): cpd = pbn.KDE(variables) assert not cpd.fitted() @@ -141,6 +216,16 @@ def _test_kde_fit_iter(variables, _df, instances): def test_kde_fit_null(): + """ + Test the fitting of the KDE (Kernel Density Estimator) model when input data contains null (NaN) values. + This test verifies that: + - The KDE model is not fitted before calling `fit` and is fitted after. 
+ - The model correctly ignores rows with null values during fitting. + - The number of training instances and variables in the fitted model matches those in a reference `scipy.stats.gaussian_kde` fitted on the same data with nulls removed. + - The computed bandwidth (covariance) of the KDE matches that of the reference implementation. + The test is performed for different combinations of variables and different numbers of training instances, using both integer and float dataframes with randomly inserted NaN values. + """ + def _test_kde_fit_null_iter(variables, _df, instances): cpd = pbn.KDE(variables) assert not cpd.fitted() From dd8208262a7ca214883662db367dfa600821e86f Mon Sep 17 00:00:00 2001 From: JuanFPR-UPM Date: Mon, 23 Jun 2025 18:02:10 +0200 Subject: [PATCH 72/75] gamma approx fix --- .../independences/hybrid/mixed_knncmi.cpp | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp b/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp index e3044301..225b28d7 100644 --- a/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp +++ b/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp @@ -364,14 +364,6 @@ double compute_variance(const std::vector& data, double mean) { return variance / data.size(); } -double compute_skewness(const std::vector& data, double mean, double variance) { - double skewness = 0.0; - for (double x : data) { - skewness += std::pow(x - mean, 3); - } - return (skewness / data.size()) / std::pow(variance, 1.5); -} - double compute_pvalue(double original_mi, std::vector& permutation_stats, bool gamma_approx) { double min_value = *std::min_element(permutation_stats.begin(), permutation_stats.end()); double max_value = *std::max_element(permutation_stats.begin(), permutation_stats.end()); @@ -393,7 +385,6 @@ double compute_pvalue(double original_mi, std::vector& permutation_stats double mean = compute_mean(permutation_stats); double variance = compute_variance(permutation_stats, mean); - double skewness = compute_skewness(permutation_stats, mean, variance); double shape, scale; shape = (mean * mean) / variance; @@ -403,11 +394,7 @@ double compute_pvalue(double original_mi, std::vector& permutation_stats boost::math::gamma_distribution<> gamma_dist(shape, scale); // use the fitted gamma distribution to compute the p-value - if (skewness > 0) { - return 1 - boost::math::cdf(gamma_dist, original_mi - min_value + epsilon); - } - - return boost::math::cdf(gamma_dist, original_mi - min_value + epsilon); + return 1 - boost::math::cdf(gamma_dist, original_mi - min_value + epsilon); } // crude p-value computation From 965b4794418f985bd2b1d188336bf34984e0ef4f Mon Sep 17 00:00:00 2001 From: JuanFPR-UPM Date: Fri, 27 Jun 2025 00:30:19 +0200 Subject: [PATCH 73/75] Fixed bug --- pybnesian/learning/independences/hybrid/mixed_knncmi.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp b/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp index 225b28d7..db15d27b 100644 --- a/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp +++ b/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp @@ -124,6 +124,10 @@ double mi_general(VPTree& ztree, for (auto i = 0; i < n_rows; ++i) { eps(i) = knn_results[i].first(k); k_hat(i) = knn_results[i].second.size(); + if (k == 1 && eps(i) == 1.0) { + k_hat(i) = 1; + eps(i) = 0.0; + } } // use the ztree to search in all Z, XZ and YZ subspaces @@ -159,6 +163,10 @@ double 
mi_pair(VPTree& ytree, for (auto i = 0; i < n_rows; ++i) { eps(i) = knn_results[i].first[k]; k_hat(i) = knn_results[i].second.size(); + if (k == 1 && eps(i) == 1.0) { + k_hat(i) = 1; + eps(i) = 0.0; + } } auto x_is_discrete_column = std::vector(is_discrete_column.begin(), is_discrete_column.begin() + 1); From 87ccd0bd28a9d078c4c0fa2b7c4ca90e368e7968 Mon Sep 17 00:00:00 2001 From: JuanFPR-UPM Date: Thu, 23 Oct 2025 14:55:03 +0200 Subject: [PATCH 74/75] Parallelized implementation --- .../independences/hybrid/mixed_knncmi.cpp | 85 ++-- .../independences/hybrid/mixed_knncmi.hpp | 18 +- pybnesian/vptree/vptree.cpp | 397 ++++++++++++------ pybnesian/vptree/vptree.hpp | 7 +- 4 files changed, 337 insertions(+), 170 deletions(-) diff --git a/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp b/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp index db15d27b..16a4312b 100644 --- a/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp +++ b/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp @@ -1,8 +1,10 @@ #include #include #include +#include #include #include +#include using Array_ptr = std::shared_ptr; using vptree::hash_columns; @@ -121,10 +123,11 @@ double mi_general(VPTree& ztree, VectorXd eps(n_rows); VectorXi k_hat(n_rows); + for (auto i = 0; i < n_rows; ++i) { eps(i) = knn_results[i].first(k); k_hat(i) = knn_results[i].second.size(); - if (k == 1 && eps(i) == 1.0) { + if (k == 1 && eps(i) == std::numeric_limits::infinity()) { k_hat(i) = 1; eps(i) = 0.0; } @@ -136,6 +139,7 @@ double mi_general(VPTree& ztree, double res = 0; auto exclude_self = [](int value) { return (value > 1) ? (value - 1) : value; }; +#pragma omp parallel for reduction(+ : res) for (int i = 0; i < n_rows; ++i) { res += boost::math::digamma(exclude_self(k_hat(i))) + boost::math::digamma(exclude_self(n_z(i))) - boost::math::digamma(exclude_self(n_xz(i))) - boost::math::digamma(exclude_self(n_yz(i))); @@ -160,10 +164,11 @@ double mi_pair(VPTree& ytree, VectorXd eps(n_rows); VectorXi k_hat(n_rows); + for (auto i = 0; i < n_rows; ++i) { eps(i) = knn_results[i].first[k]; k_hat(i) = knn_results[i].second.size(); - if (k == 1 && eps(i) == 1.0) { + if (k == 1 && eps(i) == std::numeric_limits::infinity()) { k_hat(i) = 1; eps(i) = 0.0; } @@ -183,6 +188,7 @@ double mi_pair(VPTree& ytree, double res = 0; auto exclude_self = [](int value) { return (value > 1) ? 
(value - 1) : value; }; +#pragma omp parallel for reduction(+ : res) for (int i = 0; i < n_rows; ++i) { // Z is treated as a constant column, thus n_z = n_rows - 1 res += boost::math::digamma(exclude_self(k_hat(i))) + boost::math::digamma(n_rows - 1) - @@ -220,7 +226,7 @@ int MixedKMutualInformation::find_minimum_shuffled_cluster_size(const DataFrame& switch (m_datatype->id()) { case Type::FLOAT: { auto data = shuffled_df.downcast_vector(discrete_vars); - auto hashed_cols = hash_columns(data, discrete_vars); + auto hashed_cols = hash_columns(data, discrete_vars, true); for (long unsigned int i = 0; i < hashed_cols.size(); ++i) { joint_counts[hashed_cols[i]]++; } @@ -228,7 +234,7 @@ int MixedKMutualInformation::find_minimum_shuffled_cluster_size(const DataFrame& } default: { auto data = shuffled_df.downcast_vector(discrete_vars); - auto hashed_cols = hash_columns(data, discrete_vars); + auto hashed_cols = hash_columns(data, discrete_vars, true); for (long unsigned int i = 0; i < hashed_cols.size(); ++i) { joint_counts[hashed_cols[i]]++; } @@ -372,47 +378,78 @@ double compute_variance(const std::vector& data, double mean) { return variance / data.size(); } +double compute_skewness(const std::vector& data, double mean, double variance) { + double skewness = 0.0; + + for (double x : data) { + skewness += std::pow(x - mean, 3); + } + + return (skewness / data.size()) / std::pow(variance, 1.5); +} double compute_pvalue(double original_mi, std::vector& permutation_stats, bool gamma_approx) { double min_value = *std::min_element(permutation_stats.begin(), permutation_stats.end()); double max_value = *std::max_element(permutation_stats.begin(), permutation_stats.end()); if (original_mi > max_value) { - return 0.0; + return 1.0 / static_cast((permutation_stats.size() + 1)); } else if (original_mi <= min_value) { return 1.0; } if (gamma_approx) { - // small positive value to ensure positivity - double epsilon = std::numeric_limits::epsilon(); - std::vector shifted_data; - // shift statistics to the positive interval - for (long unsigned int i = 0; i < permutation_stats.size(); ++i) { - permutation_stats[i] = permutation_stats[i] - min_value + epsilon; - } + // include unpermuted statistic for conservative p-value estimation + permutation_stats.push_back(original_mi); double mean = compute_mean(permutation_stats); double variance = compute_variance(permutation_stats, mean); + double skewness = compute_skewness(permutation_stats, mean, variance); - double shape, scale; - shape = (mean * mean) / variance; - scale = variance / mean; + // standardise to mu=0 std=1 to fit a Pearson type III PDF (Minas & Montana, 2014) + for (long unsigned int i = 0; i < permutation_stats.size(); ++i) { + permutation_stats[i] = (permutation_stats[i] - mean) / std::sqrt(variance); + } - // fit gamma using method of moments - boost::math::gamma_distribution<> gamma_dist(shape, scale); + auto z_value = permutation_stats.back(); + permutation_stats.pop_back(); - // use the fitted gamma distribution to compute the p-value - return 1 - boost::math::cdf(gamma_dist, original_mi - min_value + epsilon); + if (skewness == 0.0) { + // Standard normal distribution + boost::math::normal_distribution<> standard_normal(0.0, 1.0); + return boost::math::cdf(boost::math::complement(standard_normal, z_value)); + } + double k, theta, c; + k = 4 / std::pow(skewness, 2); // shape + theta = skewness / 2.0; // scale + c = -2.0 / skewness; // location shift + + auto x_value = (z_value - c) / theta; + + // fit gamma using method of moments to compute 
the p-value + + if (skewness > 0) { + if (x_value >= util::machine_tol) { // practically 0, but avoids convergence timeouts + return boost::math::gamma_q<>(k, x_value); // upper tail + } + + return 1.0; // outside gamma support + } + + else if (x_value >= util::machine_tol) { + return boost::math::gamma_p<>(k, x_value); // lower tail + } + + return 1.0 / static_cast((permutation_stats.size() + 1)); // outside gamma support } - // crude p-value computation - int count_greater = 0; + // crude Monte Carlo p-value computation + int count_greater = 1; for (long unsigned int i = 0; i < permutation_stats.size(); ++i) { if (permutation_stats[i] >= original_mi) ++count_greater; } - return static_cast(count_greater) / permutation_stats.size(); + return static_cast(count_greater) / static_cast((permutation_stats.size() + 1)); } double MixedKMutualInformation::pvalue(const std::string& x, const std::string& y) const { @@ -607,7 +644,7 @@ double MixedKMutualInformation::shuffled_pvalue(double original_mi, for (int i = 0; i < m_samples; ++i) { std::shuffle(order.begin(), order.end(), rng); shuffle_dataframe(original_x, shuffled_x, order, used, neighbors, rng); - // we compute the adaptive k only if X is discrete + // we recompute the adaptive k only if X is discrete if (is_discrete_column[0] && m_adaptive_k) { auto min_cluster_size = find_minimum_shuffled_cluster_size(shuffled_df, discrete_vars); k = std::min(k, min_cluster_size - 1); @@ -630,7 +667,7 @@ double MixedKMutualInformation::shuffled_pvalue(double original_mi, for (int i = 0; i < m_samples; ++i) { std::shuffle(order.begin(), order.end(), rng); shuffle_dataframe(original_x, shuffled_x, order, used, neighbors, rng); - // we compute the adaptive k only if X is discrete + // we recompute the adaptive k only if X is discrete if (is_discrete_column[0] && m_adaptive_k) { auto min_cluster_size = find_minimum_shuffled_cluster_size(shuffled_df, discrete_vars); k = std::min(k, min_cluster_size - 1); diff --git a/pybnesian/learning/independences/hybrid/mixed_knncmi.hpp b/pybnesian/learning/independences/hybrid/mixed_knncmi.hpp index c15baf02..10346ea1 100644 --- a/pybnesian/learning/independences/hybrid/mixed_knncmi.hpp +++ b/pybnesian/learning/independences/hybrid/mixed_knncmi.hpp @@ -32,14 +32,14 @@ double mi_pair(VPTree& ytree, class MixedKMutualInformation : public IndependenceTest { public: MixedKMutualInformation(DataFrame df, - int k, - unsigned int seed = std::random_device{}(), - int shuffle_neighbors = 5, - int samples = 1000, - std::string scaling = "min_max", - bool gamma_approx = true, - bool adaptive_k = true, - int tree_leafsize = 16) + int k, + unsigned int seed = std::random_device{}(), + int shuffle_neighbors = 5, + int samples = 1000, + std::string scaling = "min_max", + bool gamma_approx = true, + bool adaptive_k = true, + int tree_leafsize = 16) : m_df(df), m_scaled_df(scale_data(df, scaling)), m_datatype(), @@ -84,7 +84,7 @@ class MixedKMutualInformation : public IndependenceTest { int find_minimum_cluster_size(const std::vector& discrete_vars) const; int find_minimum_shuffled_cluster_size(const DataFrame& shuffled_df, - const std::vector& discrete_vars) const; + const std::vector& discrete_vars) const; DataFrame m_df; DataFrame m_scaled_df; std::shared_ptr m_datatype; diff --git a/pybnesian/vptree/vptree.cpp b/pybnesian/vptree/vptree.cpp index 7b4e008e..8d1ad73c 100644 --- a/pybnesian/vptree/vptree.cpp +++ b/pybnesian/vptree/vptree.cpp @@ -1,4 +1,5 @@ #include +#include namespace vptree { @@ -76,13 +77,12 @@ std::unique_ptr 
build_vptree(const HybridChebyshevDistance& p) { - return p.first == 1; // Check if any distance is 1 + return p.first == std::numeric_limits::infinity(); // Check if any distance is infinity }); // prioritize discrete splits - double threshold = 1.0; + CType threshold = 1.0; if (it == distances_indices.end()) { // if none, node radius is the median @@ -96,10 +96,9 @@ std::unique_ptr build_vptree(const HybridChebyshevDistance indices_left, indices_right; - /*notice how placing the >= on the right child does not affect continuous splits, - but significantly improves the discrete splits, which are binary {0,1}*/ + // follow convention for left child, contains neighbors within the radius and on the hypersphere surface for (size_t i = 0; i < distances_indices.size(); ++i) { - if (distances_indices[i].first < threshold) { + if (distances_indices[i].first <= threshold) { indices_left.push_back(distances_indices[i].second); } else { indices_right.push_back(distances_indices[i].second); @@ -126,6 +125,7 @@ std::unique_ptr VPTree::build_vptree(const DataFrame& df, std::vector indices(m_df->num_rows()); std::iota(indices.begin(), indices.end(), 0); std::mt19937 rng{seed}; + switch (datatype->id()) { case Type::DOUBLE: { auto data = df.downcast_vector(); @@ -153,25 +153,52 @@ std::vector> VPTree::query(const DataFrame& test_d std::vector> res(test_df->num_rows()); + // only for fully discrete data + auto cache_values = + std::all_of(m_is_discrete_column.begin(), m_is_discrete_column.end(), [](bool val) { return val; }); + + auto num_rows = test_df->num_rows(); + switch (m_datatype->id()) { case Type::FLOAT: { auto test = test_df.downcast_vector(); HybridChebyshevDistance dist(test, m_is_discrete_column); + auto hash_keys = hash_columns(test, m_column_names, cache_values); +#pragma omp parallel + { + if (cache_values) { +#pragma omp for schedule(dynamic) + for (int i = 0; i < num_rows; ++i) { + auto key = hash_keys[i]; + bool skip_query = false; +#pragma omp critical + { + auto it = m_query_cache.find(key); + if (it != m_query_cache.end()) { + skip_query = true; + res[i] = it->second; + } + } + if (skip_query) { + continue; // Skip the query, use cached result + } + + auto t = query_instance(i, k, dist); + res[i] = t; + +#pragma omp critical + { + m_query_cache[key] = t; + } + } + } - auto hash_keys = hash_columns(test, m_column_names); - - for (int i = 0; i < test_df->num_rows(); ++i) { - auto key = hash_keys[i]; - - auto it = m_query_cache.find(key); - if (it != m_query_cache.end()) { - res[i] = it->second; - // Skip the query, use cached result - } else { - auto t = query_instance(i, k, dist); - res[i] = t; - - m_query_cache[key] = t; + else { +#pragma omp for + for (int i = 0; i < num_rows; ++i) { + auto t = query_instance(i, k, dist); + res[i] = t; + } } } @@ -182,31 +209,51 @@ std::vector> VPTree::query(const DataFrame& test_d auto test = test_df.downcast_vector(); HybridChebyshevDistance dist(test, m_is_discrete_column); - - auto hash_keys = hash_columns(test, m_column_names); - for (int i = 0; i < test_df->num_rows(); ++i) { - auto key = hash_keys[i]; - - auto it = m_query_cache.find(key); - if (it != m_query_cache.end()) { - res[i] = it->second; - // Skip the query, use cached result + auto hash_keys = hash_columns(test, m_column_names, cache_values); +#pragma omp parallel + { + if (cache_values) { +#pragma omp for schedule(dynamic) + for (int i = 0; i < num_rows; ++i) { + auto key = hash_keys[i]; + bool skip_query = false; +#pragma omp critical + { + auto it = m_query_cache.find(key); + if 
@@ -153,25 +153,52 @@ std::vector<std::pair<VectorXd, VectorXi>> VPTree::query(const DataFrame& test_df,
 
     std::vector<std::pair<VectorXd, VectorXi>> res(test_df->num_rows());
 
+    // only for fully discrete data
+    auto cache_values =
+        std::all_of(m_is_discrete_column.begin(), m_is_discrete_column.end(), [](bool val) { return val; });
+
+    auto num_rows = test_df->num_rows();
+
     switch (m_datatype->id()) {
         case Type::FLOAT: {
             auto test = test_df.downcast_vector<arrow::FloatType>();
 
             HybridChebyshevDistance<arrow::FloatType> dist(test, m_is_discrete_column);
+            auto hash_keys = hash_columns(test, m_column_names, cache_values);
+#pragma omp parallel
+            {
+                if (cache_values) {
+#pragma omp for schedule(dynamic)
+                    for (int i = 0; i < num_rows; ++i) {
+                        auto key = hash_keys[i];
+                        bool skip_query = false;
+#pragma omp critical
+                        {
+                            auto it = m_query_cache.find(key);
+                            if (it != m_query_cache.end()) {
+                                skip_query = true;
+                                res[i] = it->second;
+                            }
+                        }
+                        if (skip_query) {
+                            continue;  // Skip the query, use cached result
+                        }
+
+                        auto t = query_instance(i, k, dist);
+                        res[i] = t;
+
+#pragma omp critical
+                        {
+                            m_query_cache[key] = t;
+                        }
+                    }
+                }
 
-            auto hash_keys = hash_columns(test, m_column_names);
-
-            for (int i = 0; i < test_df->num_rows(); ++i) {
-                auto key = hash_keys[i];
-
-                auto it = m_query_cache.find(key);
-                if (it != m_query_cache.end()) {
-                    res[i] = it->second;
-                    // Skip the query, use cached result
-                } else {
-                    auto t = query_instance(i, k, dist);
-                    res[i] = t;
-
-                    m_query_cache[key] = t;
+                else {
+#pragma omp for
+                    for (int i = 0; i < num_rows; ++i) {
+                        auto t = query_instance(i, k, dist);
+                        res[i] = t;
+                    }
                 }
             }
 
             break;
         }
@@ -182,31 +209,51 @@ std::vector<std::pair<VectorXd, VectorXi>> VPTree::query(const DataFrame& test_df,
             auto test = test_df.downcast_vector<arrow::DoubleType>();
 
             HybridChebyshevDistance<arrow::DoubleType> dist(test, m_is_discrete_column);
-
-            auto hash_keys = hash_columns(test, m_column_names);
-            for (int i = 0; i < test_df->num_rows(); ++i) {
-                auto key = hash_keys[i];
-
-                auto it = m_query_cache.find(key);
-                if (it != m_query_cache.end()) {
-                    res[i] = it->second;
-                    // Skip the query, use cached result
+            auto hash_keys = hash_columns(test, m_column_names, cache_values);
+#pragma omp parallel
+            {
+                if (cache_values) {
+#pragma omp for schedule(dynamic)
+                    for (int i = 0; i < num_rows; ++i) {
+                        auto key = hash_keys[i];
+                        bool skip_query = false;
+#pragma omp critical
+                        {
+                            auto it = m_query_cache.find(key);
+                            if (it != m_query_cache.end()) {
+                                skip_query = true;
+                                res[i] = it->second;
+                            }
+                        }
+                        if (skip_query) {
+                            continue;  // Skip the query, use cached result
+                        }
+                        auto t = query_instance(i, k, dist);
+                        res[i] = t;
+
+#pragma omp critical
+                        {
+                            m_query_cache[key] = t;
+                        }
+                    }
                 } else {
-                    auto t = query_instance(i, k, dist);
-                    res[i] = t;
-
-                    m_query_cache[key] = t;
+#pragma omp for
+                    for (int i = 0; i < num_rows; ++i) {
+                        auto t = query_instance(i, k, dist);
+                        res[i] = t;
+                    }
                 }
             }
         }
     }
 
-    // cleared because after permuting X the XYZ space will not be the same
-    m_query_cache.clear();
+    if (cache_values) {
+        // cleared because after permuting X the XYZ space will not be the same
+        m_query_cache.clear();
+    }
 
     return res;
 }
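The caching pattern introduced here (and repeated below for the ball counts) is: hash each row once, probe a shared map inside a critical section, and run the expensive per-instance query only on a miss. A condensed standalone sketch, with a hypothetical `expensive_query` standing in for `query_instance` (illustrative names, not the library's API):

```
// Sketch of the parallel cache-or-compute pattern (compile with -fopenmp;
// the pragmas are ignored, and the code stays correct, without it).
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

// Stand-in for the expensive k-NN query; in the real code the result depends
// only on the row's values, which the key hashes.
static double expensive_query(std::size_t key) { return static_cast<double>(key) * 0.5; }

std::vector<double> cached_queries(const std::vector<std::size_t>& hash_keys) {
    std::vector<double> res(hash_keys.size());
    std::unordered_map<std::size_t, double> cache;

#pragma omp parallel for schedule(dynamic)
    for (std::int64_t i = 0; i < static_cast<std::int64_t>(hash_keys.size()); ++i) {
        bool hit = false;
        double value = 0.0;
#pragma omp critical  // the map is not thread-safe, so lookups are serialized
        {
            auto it = cache.find(hash_keys[i]);
            if (it != cache.end()) {
                hit = true;
                value = it->second;
            }
        }
        if (!hit) {
            value = expensive_query(hash_keys[i]);  // runs concurrently across threads
#pragma omp critical
            {
                cache[hash_keys[i]] = value;  // a racing thread stores the same value,
                                              // so the duplicated work is benign
            }
        }
        res[i] = value;  // each i is written by exactly one thread
    }
    return res;
}

int main() {
    std::vector<std::size_t> keys{7, 7, 3, 7, 3};  // duplicate rows hit the cache
    for (double v : cached_queries(keys)) std::cout << v << " ";
    std::cout << "\n";
}
```

Duplicate rows are what make the cache pay off, which is presumably why it is only enabled for fully discrete data: repeated category combinations are common there, while continuous rows essentially never repeat.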
 
-
 std::tuple<VectorXi, VectorXi, VectorXi> VPTree::count_ball_subspaces(const DataFrame& test_df,
                                                                       const VectorXd& eps,
                                                                       std::vector<bool>& is_discrete_column) const {
@@ -217,68 +264,117 @@ std::tuple<VectorXi, VectorXi, VectorXi> VPTree::count_ball_subspaces(const DataFrame& test_df,
     auto n_rows = test_df->num_rows();
     VectorXi count_xz(n_rows);
     VectorXi count_yz(n_rows);
     VectorXi count_z(n_rows);
 
+    // only for fully discrete data
+    auto cache_values = std::all_of(is_discrete_column.begin(), is_discrete_column.end(), [](bool val) { return val; });
+
     switch (m_datatype->id()) {
         case Type::FLOAT: {
             auto test = test_df.downcast_vector<arrow::FloatType>();
             HybridChebyshevDistance<arrow::FloatType> distance_xyz(test, is_discrete_column);
 
-            auto hash_keys = hash_columns(test, test_df.column_names());
-
-            for (int i = 0; i < n_rows; ++i) {
-                auto key = hash_keys[i];
-
-                boost::hash_combine(key, eps(i));
-
-                auto it = m_count_cache.find(key);
-                if (it != m_count_cache.end()) {
-                    count_xz(i) = std::get<0>(it->second);
-                    count_yz(i) = std::get<1>(it->second);
-                    count_z(i) = std::get<2>(it->second);
-                    // Skip the query, use cached result
+            auto hash_keys = hash_columns(test, test_df.column_names(), cache_values);
+#pragma omp parallel
+            {
+                if (cache_values) {
+#pragma omp for schedule(dynamic)
+                    for (int i = 0; i < n_rows; ++i) {
+                        auto key = hash_keys[i];
+
+                        boost::hash_combine(key, eps(i));
+                        bool skip_query = false;
+#pragma omp critical
+                        {
+                            auto it = m_count_cache.find(key);
+                            if (it != m_count_cache.end()) {
+                                skip_query = true;
+                                count_xz(i) = std::get<0>(it->second);
+                                count_yz(i) = std::get<1>(it->second);
+                                count_z(i) = std::get<2>(it->second);
+                            }
+                        }
+                        if (skip_query) {
+                            continue;  // Skip the query, use cached result
+                        }
+                        auto c = count_ball_subspaces_instance(i, eps(i), distance_xyz);
+
+                        count_xz(i) = std::get<0>(c);
+                        count_yz(i) = std::get<1>(c);
+                        count_z(i) = std::get<2>(c);
+
+#pragma omp critical
+                        {
+                            m_count_cache[key] = c;
+                        }
+                    }
                 } else {
-                    auto c = count_ball_subspaces_instance(i, eps(i), distance_xyz);
-
-                    count_xz(i) = std::get<0>(c);
-                    count_yz(i) = std::get<1>(c);
-                    count_z(i) = std::get<2>(c);
+#pragma omp for
+                    for (int i = 0; i < n_rows; ++i) {
+                        auto c = count_ball_subspaces_instance(i, eps(i), distance_xyz);
 
-                    m_count_cache[key] = c;
+                        count_xz(i) = std::get<0>(c);
+                        count_yz(i) = std::get<1>(c);
+                        count_z(i) = std::get<2>(c);
+                    }
                 }
             }
+
             break;
         }
         default: {
             auto test = test_df.downcast_vector<arrow::DoubleType>();
             HybridChebyshevDistance<arrow::DoubleType> distance_xyz(test, is_discrete_column);
-
-            auto hash_keys = hash_columns(test, test_df.column_names());
-
-            for (int i = 0; i < n_rows; ++i) {
-                auto key = hash_keys[i];
-
-                boost::hash_combine(key, eps(i));
-
-                auto it = m_count_cache.find(key);
-                if (it != m_count_cache.end()) {
-                    count_xz(i) = std::get<0>(it->second);
-                    count_yz(i) = std::get<1>(it->second);
-                    count_z(i) = std::get<2>(it->second);
-                    // Skip the query, use cached result
+            auto hash_keys = hash_columns(test, test_df.column_names(), cache_values);
+#pragma omp parallel
+            {
+                if (cache_values) {
+#pragma omp for schedule(dynamic)
+                    for (int i = 0; i < n_rows; ++i) {
+                        auto key = hash_keys[i];
+
+                        boost::hash_combine(key, eps(i));
+                        bool skip_query = false;
+#pragma omp critical
+                        {
+                            auto it = m_count_cache.find(key);
+                            if (it != m_count_cache.end()) {
+                                skip_query = true;
+                                count_xz(i) = std::get<0>(it->second);
+                                count_yz(i) = std::get<1>(it->second);
+                                count_z(i) = std::get<2>(it->second);
+                            }
+                        }
+                        if (skip_query) {
+                            continue;  // Skip the query, use cached result
+                        }
+                        auto c = count_ball_subspaces_instance(i, eps(i), distance_xyz);
+
+                        count_xz(i) = std::get<0>(c);
+                        count_yz(i) = std::get<1>(c);
+                        count_z(i) = std::get<2>(c);
+
+#pragma omp critical
+                        {
+                            m_count_cache[key] = c;
+                        }
+                    }
                 } else {
-                    auto c = count_ball_subspaces_instance(i, eps(i), distance_xyz);
+#pragma omp for
+                    for (int i = 0; i < n_rows; ++i) {
+                        auto c = count_ball_subspaces_instance(i, eps(i), distance_xyz);
 
-                    count_xz(i) = std::get<0>(c);
-                    count_yz(i) = std::get<1>(c);
-                    count_z(i) = std::get<2>(c);
-
-                    m_count_cache[key] = c;
+                        count_xz(i) = std::get<0>(c);
+                        count_yz(i) = std::get<1>(c);
+                        count_z(i) = std::get<2>(c);
+                    }
                 }
             }
         }
     }
 
-    // cleared because after permuting X the XYZ space will not be the same
-    m_count_cache.clear();
+    if (cache_values) {
+        // cleared because after permuting X the XYZ space will not be the same
+        m_count_cache.clear();
+    }
 
     return std::make_tuple(count_xz, count_yz, count_z);
 }
@@ -286,25 +382,25 @@ std::tuple<VectorXi, VectorXi, VectorXi> VPTree::count_ball_subspaces(const DataFrame& test_df,
 
 template <typename ArrowType>
 std::vector<size_t> vptree::hash_columns(
     const std::vector<std::shared_ptr<typename arrow::TypeTraits<ArrowType>::ArrayType>>& data,
-    std::vector<std::string> column_names) {
+    std::vector<std::string> column_names,
+    bool discrete_data) {
     int num_rows = data.empty() ? 0 : data[0]->length();
 
     std::vector<size_t> row_hashes(num_rows, 0);
 
-    size_t colnames_hash = boost::hash_range(column_names.begin(), column_names.end());
-
-    for (long unsigned int j = 0; j < data.size(); ++j) {
+    if (discrete_data) {
+        size_t colnames_hash = boost::hash_range(column_names.begin(), column_names.end());
+#pragma omp parallel for
         for (int i = 0; i < num_rows; ++i) {
-            auto value = data[j]->Value(i);
-
-            boost::hash_combine(row_hashes[i], value);
+            size_t h = 0;  // local hash for row i
+            for (size_t j = 0; j < data.size(); ++j) {
+                auto value = data[j]->Value(i);
+                boost::hash_combine(h, value);
+            }
+            boost::hash_combine(h, colnames_hash);
+            row_hashes[i] = h;
         }
     }
 
-    // column names are needed as the discrete values are all dummy dictionary keys 0,1,2...
-    for (int i = 0; i < num_rows; ++i) {
-        boost::hash_combine(row_hashes[i], colnames_hash);
-    }
-
     return row_hashes;
 }
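The row-hashing scheme is the linchpin of the cache: each row gets its own accumulator, so the outer loop parallelizes without races, and the column-name hash is folded in because dictionary-encoded categories are plain codes 0, 1, 2, ... that would otherwise collide across different variable sets. A standalone sketch using plain `int` codes in place of Arrow arrays (illustrative, not the real signature):

```
// Sketch of per-row hashing over column-major data.
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>
#include <boost/functional/hash.hpp>

std::vector<std::size_t> hash_rows(const std::vector<std::vector<int>>& columns,
                                   const std::vector<std::string>& column_names) {
    std::size_t num_rows = columns.empty() ? 0 : columns[0].size();
    std::size_t colnames_hash = boost::hash_range(column_names.begin(), column_names.end());
    std::vector<std::size_t> row_hashes(num_rows, 0);

    for (std::size_t i = 0; i < num_rows; ++i) {
        std::size_t h = 0;  // independent per-row accumulator: safe to parallelize over i
        for (std::size_t j = 0; j < columns.size(); ++j)
            boost::hash_combine(h, columns[j][i]);
        boost::hash_combine(h, colnames_hash);  // disambiguates identical codes across variable sets
        row_hashes[i] = h;
    }
    return row_hashes;
}

int main() {
    std::vector<std::vector<int>> cols{{0, 1, 0}, {1, 1, 0}};  // dictionary codes, column-major
    for (auto h : hash_rows(cols, {"A", "B"})) std::cout << h << "\n";
}
```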
@@ -316,29 +412,52 @@ VectorXi VPTree::count_ball_unconditional(const DataFrame& test_df,
     auto n_rows = test_df->num_rows();
     VectorXi count_n(n_rows);
 
+    // only for fully discrete data
+    auto cache_values = std::all_of(is_discrete_column.begin(), is_discrete_column.end(), [](bool val) { return val; });
+
     switch (m_datatype->id()) {
         case Type::FLOAT: {
             auto test = test_df.downcast_vector<arrow::FloatType>();
             HybridChebyshevDistance<arrow::FloatType> distance(test, is_discrete_column);
+            auto hash_keys = hash_columns(test, test_df.column_names(), cache_values);
+#pragma omp parallel
+            {
+                if (cache_values) {
+#pragma omp for schedule(dynamic)
+                    for (int i = 0; i < n_rows; ++i) {
+                        auto key = hash_keys[i];
+
+                        boost::hash_combine(key, eps(i));
+                        bool skip_query = false;
+#pragma omp critical
+                        {
+                            auto it = m_count_cache_unconditional.find(key);
+                            if (it != m_count_cache_unconditional.end()) {
+                                skip_query = true;
+                                count_n(i) = it->second;
+                            }
+                        }
+                        if (skip_query) {
+                            continue;  // Skip the query, use cached result
+                        }
+
+                        auto c = count_ball_unconditional_instance(i, eps(i), distance);
+
+                        count_n(i) = c;
+
+#pragma omp critical
+                        {
+                            m_count_cache_unconditional[key] = c;
+                        }
+                    }
+                } else {
+#pragma omp for
+                    for (int i = 0; i < n_rows; ++i) {
+                        auto c = count_ball_unconditional_instance(i, eps(i), distance);
 
-            auto hash_keys = hash_columns(test, test_df.column_names());
-
-            for (int i = 0; i < n_rows; ++i) {
-                auto key = hash_keys[i];
-
-                boost::hash_combine(key, eps(i));
-
-                auto it = m_count_cache_unconditional.find(key);
-                if (it != m_count_cache_unconditional.end()) {
-                    count_n(i) = it->second;
-                    continue;  // Skip the query, use cached result
+                        count_n(i) = c;
+                    }
                 }
-
-                auto c = count_ball_unconditional_instance(i, eps(i), distance);
-
-                count_n(i) = c;
-
-                m_count_cache_unconditional[key] = c;
             }
 
             break;
         }
@@ -346,25 +465,44 @@ VectorXi VPTree::count_ball_unconditional(const DataFrame& test_df,
         default: {
             auto test = test_df.downcast_vector<arrow::DoubleType>();
             HybridChebyshevDistance<arrow::DoubleType> distance(test, is_discrete_column);
+            auto hash_keys = hash_columns(test, test_df.column_names(), cache_values);
+#pragma omp parallel
+            {
+                if (cache_values) {
+#pragma omp for schedule(dynamic)
+                    for (int i = 0; i < n_rows; ++i) {
+                        auto key = hash_keys[i];
+
+                        boost::hash_combine(key, eps(i));
+                        bool skip_query = false;
+#pragma omp critical
+                        {
+                            auto it = m_count_cache_unconditional.find(key);
+                            if (it != m_count_cache_unconditional.end()) {
+                                skip_query = true;
+                                count_n(i) = it->second;
+                            }
+                        }
+                        if (skip_query) {
+                            continue;  // Skip the query, use cached result
+                        }
+
+                        auto c = count_ball_unconditional_instance(i, eps(i), distance);
+
+                        count_n(i) = c;
+#pragma omp critical
+                        {
+                            m_count_cache_unconditional[key] = c;
+                        }
+                    }
+                } else {
+#pragma omp for
+                    for (int i = 0; i < n_rows; ++i) {
+                        auto c = count_ball_unconditional_instance(i, eps(i), distance);
 
-            auto hash_keys = hash_columns(test, test_df.column_names());
-
-            for (int i = 0; i < n_rows; ++i) {
-                auto key = hash_keys[i];
-
-                boost::hash_combine(key, eps(i));
-
-                auto it = m_count_cache_unconditional.find(key);
-                if (it != m_count_cache_unconditional.end()) {
-                    count_n(i) = it->second;
-                    continue;  // Skip the query, use cached result
+                        count_n(i) = c;
+                    }
                 }
-
-                auto c = count_ball_unconditional_instance(i, eps(i), distance);
-
-                count_n(i) = c;
-
-                m_count_cache_unconditional[key] = c;
             }
         }
     }
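One subtlety in the count caches above: the key must also encode the query radius, because the same point queried with a different eps(i) describes a different ball. A tiny illustration of the composite key (the `make_key` helper is hypothetical):

```
// Folding the radius into the row hash so each (row, eps) pair gets its own entry.
#include <cstddef>
#include <iostream>
#include <boost/functional/hash.hpp>

std::size_t make_key(std::size_t row_hash, double eps) {
    std::size_t key = row_hash;
    boost::hash_combine(key, eps);  // hashes the double's value into the key
    return key;
}

int main() {
    // same row, different radii -> two distinct cache keys
    std::cout << make_key(42u, 0.5) << " " << make_key(42u, 0.7) << "\n";
}
```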
@@ -449,9 +587,6 @@ std::pair<VectorXd, VectorXi> VPTree::query_instance(size_t i,
 
     // use triangular inequality to prune branches
     CType left_min_distance = distance_neigh - node->threshold;
 
-    // epsilon enforces inequality for discrete distances
-    if (left_min_distance == 0 && distance_neigh == 1) left_min_distance += std::numeric_limits<CType>::epsilon();
-
     if (node->left && left_min_distance <= distance_upper_bound) {
         query_nodes.push(QueryNode{node->left.get(), left_min_distance});
     }
@@ -522,8 +657,8 @@ std::tuple<int, int, int> VPTree::count_ball_subspaces_instance(
 
     for (auto it_neigh = eval_neighbors.begin(), neigh_end = eval_neighbors.end(); it_neigh != neigh_end;
          ++it_neigh) {
-        // trick: since Z is a subspace of XZ and YZ, we can constrain the vptree building and search just to Z,
-        // then check for X&Y
+        // trick: since Z is a subspace of XZ and YZ, we can constrain the vptree building and
+        // search just to Z, then check for X&Y
         d_z = distance_xyz.distance_coords(*it_neigh, i, z_indices);
 
         if (d_z <= eps_value) {
@@ -548,9 +683,6 @@ std::tuple<int, int, int> VPTree::count_ball_subspaces_instance(
 
     // use triangular inequality to prune branches
     CType left_min_distance = d_z - node->threshold;
 
-    // epsilon enforces inequality for discrete distances
-    if (left_min_distance == 0 && d_z == 1) left_min_distance += std::numeric_limits<CType>::epsilon();
-
     if (node->left && left_min_distance <= eps_value) {
         query_nodes.push(QueryNode{node->left.get(), left_min_distance});
     }
@@ -615,9 +747,6 @@ int VPTree::count_ball_unconditional_instance(size_t i,
 
     // use triangular inequality to prune branches
     CType left_min_distance = distance_neigh - node->threshold;
 
-    // epsilon enforces inequality for discrete distances
-    if (left_min_distance == 0 && distance_neigh == 1) left_min_distance += std::numeric_limits<CType>::epsilon();
-
     if (node->left && left_min_distance <= eps_value) {
         query_nodes.push(QueryNode{node->left.get(), left_min_distance});
     }
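With the {0, inf} discrete distance in place, the epsilon nudges deleted above appear to be unnecessary: a category mismatch used to contribute a distance of exactly 1, which could tie with the node threshold and defeat the triangle-inequality test, whereas an infinite distance makes `left_min_distance = d - threshold` prune the left branch on its own. A standalone sketch of the hybrid Chebyshev distance as now defined (illustrative; assumes scaled continuous values and numeric category codes):

```
// Hybrid Chebyshev distance: max over coordinates, with |x - y| on continuous
// columns and {0, inf} on discrete ones.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <limits>
#include <vector>

double hybrid_chebyshev(const std::vector<double>& a,
                        const std::vector<double>& b,
                        const std::vector<bool>& is_discrete) {
    double d = 0.0;
    for (std::size_t i = 0; i < a.size(); ++i) {
        double di;
        if (is_discrete[i])
            // different categories are infinitely far apart, so a finite search
            // radius can never cross a discrete boundary
            di = (a[i] != b[i]) ? std::numeric_limits<double>::infinity() : 0.0;
        else
            di = std::fabs(a[i] - b[i]);
        d = std::max(d, di);
    }
    return d;
}

int main() {
    std::cout << hybrid_chebyshev({0.2, 1.0}, {0.5, 1.0}, {false, true}) << "\n";  // 0.3
    std::cout << hybrid_chebyshev({0.2, 1.0}, {0.5, 0.0}, {false, true}) << "\n";  // inf
}
```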
diff --git a/pybnesian/vptree/vptree.hpp b/pybnesian/vptree/vptree.hpp
index 5677567d..430a1128 100644
--- a/pybnesian/vptree/vptree.hpp
+++ b/pybnesian/vptree/vptree.hpp
@@ -15,7 +15,8 @@ namespace vptree {
 template <typename ArrowType>
 std::vector<size_t> hash_columns(
     const std::vector<std::shared_ptr<typename arrow::TypeTraits<ArrowType>::ArrayType>>& data,
-    std::vector<std::string> column_names);
+    std::vector<std::string> column_names,
+    bool discrete_data);
 
 template <typename ArrowType>
 class HybridChebyshevDistance {
@@ -30,9 +31,9 @@ class HybridChebyshevDistance {
         m_operations_coords.reserve(m_data.size());
         for (size_t i = 0; i < m_data.size(); ++i) {
             if (is_discrete_column[i]) {
-                // For discrete columns, Hamming distance
+                // For discrete columns, Hamming {0, inf} distance
                 m_operations_coords.push_back([this, i](size_t p1_index, size_t p2_index) -> CType {
-                    return (m_data[i]->Value(p1_index) != m_data[i]->Value(p2_index));
+                    return (m_data[i]->Value(p1_index) != m_data[i]->Value(p2_index)) ? std::numeric_limits<CType>::infinity() : 0.0;
                 });
             } else {
                 // For continuous columns, Manhattan distance

From e60517fcf216f607efe45d351ba56600ef73cb1d Mon Sep 17 00:00:00 2001
From: Carlos Li Hu
Date: Tue, 4 Nov 2025 09:09:55 +0000
Subject: [PATCH 75/75] fix: update README with diagonal-bandwidth instructions

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 051c6efc..2154b6df 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-![build](https://img.shields.io/github/actions/workflow/status/davenza/pybnesian/release.yml)
+![build](https://img.shields.io/github/actions/workflow/status/carloslihu/pybnesian/release.yml)
 [![Documentation Status](https://readthedocs.org/projects/pybnesian/badge/?version=latest)](https://pybnesian.readthedocs.io/en/latest/?badge=latest)
 ![PyPI](https://img.shields.io/pypi/v/pybnesian?color=blue)
 
@@ -304,9 +304,9 @@ Building
 Clone the repository:
 
 ```
-git clone https://github.com/davenza/PyBNesian.git
+git clone https://github.com/carloslihu/PyBNesian.git
 cd PyBNesian
-git checkout v0.5.1 # You can checkout a specific version if you want
+git checkout feature/diagonal-bandwidth # Optional: check out the branch with the diagonal-bandwidth KDE
 pip install .
 ```