|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""generate_sbom.py config. Operational configuration values stored separately from the core code.""" |
| 3 | + |
| 4 | +import json |
| 5 | +import logging |
| 6 | +import re |
| 7 | + |
# Module logger, looked up by the fixed name "generate_sbom" rather than __name__.
logger = logging.getLogger("generate_sbom")
# NOTSET means this logger imposes no threshold of its own; per the stdlib logging
# rules its effective level is then taken from its ancestor loggers.
logger.setLevel(logging.NOTSET)
| 10 | + |
| 11 | + |
| 12 | +# ################ Component Filters ################ |
| 13 | + |
# List of Endor Labs SBOM components that must be removed before processing
endor_components_remove = [
    # An incorrect match from parts of pkg:github/madler/zlib
    "zlib-ng/zlib-ng",
]

# bom-ref prefixes (Endor Labs has been changing them, so add all that we have seen)
prefixes = [
    "pkg:c/github.com/",
    "pkg:generic/github.com/",
    "pkg:github/",
]

# Add every prefixed spelling of each bare component name to the removal list.
# Iterate over a snapshot (list(...)) of the bare names: appending to the very list
# that is being iterated would make the loop visit its own additions and never end.
for component in list(endor_components_remove):
    for prefix in prefixes:
        endor_components_remove.append(prefix + component)
| 30 | + |
| 31 | +# ################ Component Renaming ################ |
# Endor does not have syntactically valid PURLs for C/C++ packages.
# e.g.,
# Invalid: pkg:c/github.com/abseil/abseil-cpp@<version>
# Valid: pkg:github/abseil/abseil-cpp@<version>
# Run string replacements to correct for this:
# Each entry is a [search, replacement] pair of plain strings.
# NOTE(review): the package-specific entries precede the generic prefix rewrites;
# presumably the consumer applies the pairs in list order -- confirm against the caller.
endor_components_rename = [
    ["pkg:generic/zlib.net/zlib", "pkg:github/madler/zlib"],
    ["pkg:github/philsquared/clara", "pkg:github/catchorg/clara"],
    ["pkg:generic/github.com/", "pkg:github/"],
    ["pkg:c/github.com/", "pkg:github/"],
]
| 43 | + |
| 44 | +# ################ PURL Validation ################ |
# Optional trailing PURL parts, appended to every type-specific pattern below.
# The fragment ends with "$", so each pattern that uses it is anchored at the end.
REGEX_STR_PURL_OPTIONAL = (
    # Optional Version (any chars except ? @ #)
    r"(?:@[^?@#]*)?"
    # Optional Qualifiers (any chars except @ #)
    r"(?:\?[^@#]*)?"
    # Optional Subpath (any chars)
    r"(?:#.*)?$"
)

# Compiled validation pattern per supported PURL type, keyed by the type name.
REGEX_PURL = {
    # deb PURL. https://github.com/package-url/purl-spec/blob/main/types-doc/deb-definition.md
    "deb": re.compile(
        r"^pkg:deb/"  # Scheme and type
        # Namespace (vendor): exactly one of the accepted distributions. No repetition
        # quantifier here, otherwise run-together values like "debianubuntu" would pass.
        r"(debian|ubuntu)"
        r"/"
        r"[a-z0-9._-]+" + REGEX_STR_PURL_OPTIONAL  # Name
    ),
    # Generic PURL. https://github.com/package-url/purl-spec/blob/main/types-doc/generic-definition.md
    "generic": re.compile(
        r"^pkg:generic/"  # Scheme and type
        r"([a-zA-Z0-9._-]+/)?"  # Optional namespace segment
        r"[a-zA-Z0-9._-]+" + REGEX_STR_PURL_OPTIONAL  # Name (required)
    ),
    # GitHub PURL. https://github.com/package-url/purl-spec/blob/main/types-doc/github-definition.md
    "github": re.compile(
        r"^pkg:github/"  # Scheme and type
        # Namespace (organization/user), letters must be lowercase
        r"[a-z0-9-]+"
        r"/"
        r"[a-z0-9._-]+" + REGEX_STR_PURL_OPTIONAL  # Name (repository)
    ),
    # PyPI PURL. https://github.com/package-url/purl-spec/blob/main/types-doc/pypi-definition.md
    "pypi": re.compile(
        r"^pkg:pypi/"  # Scheme and type
        r"[a-z0-9_-]+"  # Name, letters must be lowercase, dashes, underscore
        + REGEX_STR_PURL_OPTIONAL
    ),
}
| 83 | + |
| 84 | + |
def is_valid_purl(purl: str) -> bool:
    """Report whether `purl` matches any of the per-type patterns in REGEX_PURL.

    The patterns are tried in the dictionary's iteration order; the first one that
    matches is logged at debug level and the function returns True. Returns False
    when no pattern matches.
    """
    first_hit = next(
        ((kind, pattern) for kind, pattern in REGEX_PURL.items() if pattern.match(purl)),
        None,
    )
    if first_hit is None:
        return False
    kind, pattern = first_hit
    logger.debug(f"PURL: {purl} matched PURL type '{kind}' regex '{pattern.pattern}'")
    return True
| 92 | + |
| 93 | + |
| 94 | +# ################ Version Transformation ################ |
| 95 | + |
# In some cases we need to transform the version string to strip out tag-related text
# It is unknown what patterns may appear in the future, so we have targeted (not broad) regex
# This is a list of 'pattern' and 'repl' inputs to re.sub()

# One numeric version field: "0" or a number without leading zeros (one capture group).
RE_VER_NUM = r"(0|[1-9]\d*)"
# Optional "-<pre-release>" followed by optional "+<build>" (two capture groups).
RE_VER_LBL = r"(?:-((?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?"
# major.minor.patch plus the optional labels: five capture groups in total, unanchored.
RE_SEMVER = rf"{RE_VER_NUM}\.{RE_VER_NUM}\.{RE_VER_NUM}{RE_VER_LBL}"
regex_semver = re.compile(RE_SEMVER)

# Each entry is [compiled pattern, replacement]. Where RE_SEMVER is wrapped in an
# outer group, that outer group is \1 and the replacement keeps the whole semver text.
VERSION_PATTERN_REPL = [
    # 'debian/1.28.1-1' pkg:github/mongodb/mongo-c-driver (temporary workaround)
    [re.compile(rf"^debian/({RE_SEMVER})-\d$"), r"\1"],
    # 'gperftools-2.9.1' pkg:github/gperftools/gperftools
    # 'mongo/v1.5.2' pkg:github/google/benchmark
    # 'mongodb-8.2.0-alpha2' pkg:github/wiredtiger/wiredtiger
    # 'release-1.12.0' pkg:github/apache/avro
    # 'yaml-cpp-0.6.3' pkg:github/jbeder/yaml-cpp
    [re.compile(rf"^[-a-z]+[-/][vr]?({RE_SEMVER})$"), r"\1"],
    # 'asio-1-34-2' pkg:github/chriskohlhoff/asio
    # 'cares-1_27_0' pkg:github/c-ares/c-ares
    # The replacement rebuilds "major.minor.patch" from the three numeric groups only;
    # any label matched by RE_VER_LBL is not carried into the result.
    [
        re.compile(rf"^[a-z]+-{RE_VER_NUM}[_-]{RE_VER_NUM}[_-]{RE_VER_NUM}{RE_VER_LBL}$"),
        r"\1.\2.\3",
    ],
    # 'pcre2-10.40' pkg:github/pcre2project/pcre2
    [re.compile(rf"^[a-z0-9]+-({RE_VER_NUM}\.{RE_VER_NUM})$"), r"\1"],
    # 'icu-release-57-1' pkg:github/unicode-org/icu
    [re.compile(rf"^[a-z]+-?[a-z]+-{RE_VER_NUM}-{RE_VER_NUM}$"), r"\1.\2"],
    # 'v2.6.0' pkg:github/confluentinc/librdkafka
    # 'r2.5.1'
    [re.compile(rf"^[rv]({RE_SEMVER})$"), r"\1"],
    # 'v2025.04.21.00' pkg:github/facebook/folly
    [re.compile(r"^v(\d+\.\d+\.\d+\.\d+)$"), r"\1"],
]
| 129 | + |
| 130 | + |
def get_semver_from_release_version(release_ver: str) -> str:
    """Strip tag-style decoration from a release string, leaving the version number.

    The entries of VERSION_PATTERN_REPL are tried in order and the substitution of
    the first pattern that matches is returned. A falsy input (None or "") and a
    string that no pattern matches are both returned unchanged.
    """
    if not release_ver:
        return release_ver
    rewritten = next(
        (
            pattern.sub(replacement, release_ver)
            for pattern, replacement in VERSION_PATTERN_REPL
            if pattern.match(release_ver)
        ),
        None,
    )
    return release_ver if rewritten is None else rewritten
| 138 | + |
| 139 | + |
| 140 | +# region special component use-case functions |
| 141 | + |
| 142 | + |
def get_version_from_wiredtiger_import_data(file_path: str) -> str:
    """Read the JSON document at `file_path` and return the value of its 'commit' key.

    Any exception raised while opening or parsing the file is logged as an error and
    None is returned. None is also returned when the document has no 'commit' key.
    """
    try:
        with open(file_path, "r") as import_file:
            parsed = json.load(import_file)
    except Exception as err:
        logger.error(f"Error loading JSON file from {file_path}")
        logger.error(err)
        return None
    return parsed.get("commit")
| 154 | + |
| 155 | + |
def get_version_sasl_from_workspace(file_path: str) -> str:
    """Find the Windows Cyrus SASL version by scanning the text of WORKSPACE.bazel.

    Looks for the first line that, once stripped, starts with the download URL, e.g.
        "https://s3.amazonaws.com/boxes.10gen.com/build/windows_cyrus_sasl-2.1.28.zip",
    and returns the text between 'windows_cyrus_sasl-' and '.zip' ('2.1.28' above).
    Returns None when no such line exists, or when reading the file fails (the
    failure is logged as a warning).
    """
    url_start = '"https://s3.amazonaws.com/boxes.10gen.com/build/windows_cyrus_sasl-'
    try:
        with open(file_path, "r") as workspace:
            for raw_line in workspace:
                entry = raw_line.strip()
                if not entry.startswith(url_start):
                    continue
                after_marker = entry.split("windows_cyrus_sasl-")[1]
                return after_marker.partition(".zip")[0]
    except Exception as err:
        logger.warning(f"Unable to load {file_path}")
        logger.warning(err)
    return None
| 172 | + |
| 173 | + |
def process_component_special_cases(
    component_key: str, component: dict, versions: dict, repo_root: str
) -> None:
    """Fill versions["import_script"] for components that need bespoke version lookup.

    Only two component keys are handled; for any other key nothing happens.
    `versions` is modified in place and nothing is returned.
    """
    ## Special case for Cyrus SASL ##
    if component_key == "pkg:github/cyrusimap/cyrus-sasl":
        # Cyrus SASL is optionally loaded as a Windows library, when needed. There is no source code for Endor Labs to scan.
        # The version that is used is defined in the WORKSPACE.bazel file:
        # "https://s3.amazonaws.com/boxes.10gen.com/build/windows_cyrus_sasl-2.1.28.zip",
        # Rather than add the complexity of Bazel queries to this script, we just search the text.
        workspace_path = repo_root + "/WORKSPACE.bazel"
        versions["import_script"] = get_version_sasl_from_workspace(workspace_path)
        logger.info(
            f"VERSION SPECIAL CASE: {component_key}: Found version '{versions['import_script']}' in 'WORKSPACE.bazel' file"
        )
        return

    ## Special case for wiredtiger ##
    if component_key != "pkg:github/wiredtiger/wiredtiger":
        return

    # MongoDB release branches import wiredtiger commits via a bot. These commits will likely not line up with a release or tag.
    # Endor Labs will try to pull the nearest release/tag, but we want the more precise commit hash, which is stored in:
    # src/third_party/wiredtiger/import.data
    occurrences = component.get("evidence", {}).get("occurrences", [])
    if not occurrences:
        return

    # The first recorded occurrence supplies the folder that holds import.data.
    location = occurrences[0].get("location")
    import_data_path = f"{repo_root}/{location}/import.data"
    versions["import_script"] = get_version_from_wiredtiger_import_data(import_data_path)
    logger.info(
        f"VERSION SPECIAL CASE: {component_key}: Found version '{versions['import_script']}' in 'import.data' file"
    )
| 203 | + |
| 204 | +# endregion special component use-case functions |
0 commit comments