From 7d705209d9369043e3814344ab11cb56c6a516bc Mon Sep 17 00:00:00 2001 From: Rahul Krishna Date: Tue, 28 Jan 2025 19:56:24 -0500 Subject: [PATCH 1/4] Initialize C/C++ support feature branch. Signed-off-by: Rahul Krishna --- cldk/analysis/c/__init__.py | 20 ++ cldk/analysis/c/c_analysis.py | 376 ++++++++++++++++++++++++ cldk/analysis/c/clang/__init__.py | 23 ++ cldk/analysis/c/clang/clang_analyzer.py | 365 +++++++++++++++++++++++ cldk/analysis/common/__init__.py | 0 cldk/analysis/common/lsp/__init__.py | 0 cldk/analysis/common/lsp/lsp.py | 0 cldk/core.py | 5 +- cldk/models/c/__init__.py | 21 ++ cldk/models/c/models.py | 313 ++++++++++++++++++++ poetry.lock | 32 +- pyproject.toml | 4 +- 12 files changed, 1156 insertions(+), 3 deletions(-) create mode 100644 cldk/analysis/c/__init__.py create mode 100644 cldk/analysis/c/c_analysis.py create mode 100644 cldk/analysis/c/clang/__init__.py create mode 100644 cldk/analysis/c/clang/clang_analyzer.py create mode 100644 cldk/analysis/common/__init__.py create mode 100644 cldk/analysis/common/lsp/__init__.py create mode 100644 cldk/analysis/common/lsp/lsp.py create mode 100644 cldk/models/c/__init__.py create mode 100644 cldk/models/c/models.py diff --git a/cldk/analysis/c/__init__.py b/cldk/analysis/c/__init__.py new file mode 100644 index 0000000..e0553df --- /dev/null +++ b/cldk/analysis/c/__init__.py @@ -0,0 +1,20 @@ +################################################################################ +# Copyright IBM Corporation 2024 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+"""
+C Analysis
+"""
+from .c_analysis import CAnalysis
diff --git a/cldk/analysis/c/c_analysis.py b/cldk/analysis/c/c_analysis.py
new file mode 100644
index 0000000..a5e38d4
--- /dev/null
+++ b/cldk/analysis/c/c_analysis.py
@@ -0,0 +1,283 @@
+################################################################################
+# Copyright IBM Corporation 2024
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+"""
+Analysis model for C projects
+"""
+
+from pathlib import Path
+from typing import Dict, List, Optional
+
+from networkx import DiGraph
+
+from cldk.analysis.c.clang import ClangAnalyzer
+from cldk.models.c import CApplication, CFunction, CTranslationUnit, CMacro, CTypedef, CStruct, CEnum, CVariable
+
+
+def _collect_from_units(translation_units, attribute: str) -> list:
+    """Concatenate the list stored under ``attribute`` on every translation unit."""
+    collected = []
+    for translation_unit in translation_units.values():
+        collected.extend(getattr(translation_unit, attribute))
+    return collected
+
+
+def _lookup_in_unit(translation_units, file_name: str, attribute: str):
+    """Return ``attribute`` of the unit keyed by ``file_name``, or None when the key is absent."""
+    translation_unit = translation_units.get(file_name)
+    if translation_unit is None:
+        return None
+    return getattr(translation_unit, attribute)
+
+
+class CAnalysis:
+    """Analysis facade for a C project, backed by a ``CApplication`` built with Clang."""
+
+    def __init__(self, project_dir: Path) -> None:
+        """Analyze the project rooted at ``project_dir`` and keep the resulting model.
+
+        Args:
+            project_dir (Path): Project root; values that are not a ``Path`` are wrapped in one.
+        """
+        if not isinstance(project_dir, Path):
+            project_dir = Path(project_dir)
+        self.c_application = self._init_application(project_dir)
+
+    def _init_application(self, project_dir: Path) -> CApplication:
+        """Build the application model from every ``*.c`` file found recursively.
+
+        Args:
+            project_dir (Path): Path to the project directory.
+
+        Returns:
+            CApplication: Model whose translation units are keyed by source file path.
+        """
+        analyzer = ClangAnalyzer()
+        translation_units = {str(source_file): analyzer.analyze_file(source_file) for source_file in project_dir.rglob("*.c")}
+        return CApplication(translation_units=translation_units)
+
+    def get_c_application(self) -> CApplication:
+        """Return the application model built at construction time."""
+        return self.c_application
+
+    def get_imports(self) -> List[str]:
+        """Not implemented yet; always raises NotImplementedError."""
+        raise NotImplementedError("Support for this functionality has not been implemented yet.")
+
+    def get_variables(self, **kwargs):
+        """Not implemented yet; always raises NotImplementedError."""
+        raise NotImplementedError("Support for this functionality has not been implemented yet.")
+
+    def get_application_view(self) -> CApplication:
+        """Return the application model (the same object as ``get_c_application``)."""
+        return self.c_application
+
+    def get_symbol_table(self) -> Dict[str, CTranslationUnit]:
+        """Not implemented yet; always raises NotImplementedError."""
+        raise NotImplementedError("Support for this functionality has not been implemented yet.")
+
+    def get_compilation_units(self) -> List[CTranslationUnit]:
+        """Not implemented yet; always raises NotImplementedError."""
+        raise NotImplementedError("Support for this functionality has not been implemented yet.")
+
+    def is_parsable(self, source_code: str) -> bool:
+        """Check whether ``source_code`` parses with Clang. Not implemented yet.
+
+        Raises:
+            NotImplementedError: Always.
+        """
+        raise NotImplementedError("Support for this functionality has not been implemented yet.")
+
+    def get_call_graph(self) -> DiGraph:
+        """Return the call graph of the C code. Not implemented yet.
+
+        Raises:
+            NotImplementedError: Always.
+        """
+        raise NotImplementedError("Support for this functionality has not been implemented yet.")
+
+    def get_call_graph_json(self) -> str:
+        """Return the call graph serialized as JSON. Not implemented yet.
+
+        Raises:
+            NotImplementedError: Always.
+        """
+        raise NotImplementedError("Producing a call graph over a single file is not implemented yet.")
+
+    def get_callers(self, function: CFunction) -> Dict:
+        """Return the callers of ``function``. Not implemented yet.
+
+        Raises:
+            NotImplementedError: Always.
+        """
+        raise NotImplementedError("Generating all callers over a single file is not implemented yet.")
+
+    def get_callees(self, function: CFunction) -> Dict:
+        """Return the callees of ``function``. Not implemented yet.
+
+        Raises:
+            NotImplementedError: Always.
+        """
+        raise NotImplementedError("Generating all callees over a single file is not implemented yet.")
+
+    def get_functions(self) -> Dict[str, CFunction]:
+        """Return the functions of every translation unit, keyed by function name.
+
+        When two translation units define the same name, the unit visited later wins.
+
+        Returns:
+            Dict[str, CFunction]: Merged function table; empty when the project has no units.
+        """
+        functions: Dict[str, CFunction] = {}
+        for translation_unit in self.c_application.translation_units.values():
+            functions.update(translation_unit.functions)
+        return functions
+
+    def get_function(self, function_name: str, file_name: Optional[str]) -> CFunction | List[CFunction]:
+        """Return the function(s) named ``function_name``. Not implemented yet.
+
+        Raises:
+            NotImplementedError: Always.
+        """
+        raise NotImplementedError("Support for this functionality has not been implemented yet.")
+
+    def get_C_file(self, file_name: str) -> str:
+        """Return the C file called ``file_name``. Not implemented yet.
+
+        Raises:
+            NotImplementedError: Always.
+        """
+        raise NotImplementedError("Support for this functionality has not been implemented yet.")
+
+    def get_C_compilation_unit(self, file_path: str) -> CTranslationUnit | None:
+        """Return the translation unit analyzed for ``file_path``.
+
+        Args:
+            file_path (str): Source path, spelled as it was discovered under the project directory.
+
+        Returns:
+            CTranslationUnit | None: The unit for that file, or None if the file was not analyzed.
+        """
+        return self.c_application.translation_units.get(str(file_path))
+
+    def get_functions_in_file(self, file_name: str) -> List[CFunction]:
+        """Return the functions declared in ``file_name``. Not implemented yet.
+
+        Raises:
+            NotImplementedError: Always.
+        """
+        raise NotImplementedError("Support for this functionality has not been implemented yet.")
+
+    def get_includes(self) -> List[str]:
+        """Return the include directives of every file; empty list if there are none."""
+        return _collect_from_units(self.c_application.translation_units, "includes")
+
+    def get_includes_in_file(self, file_name: str) -> List[str] | None:
+        """Return the include directives of ``file_name``, or None if the file is unknown."""
+        return _lookup_in_unit(self.c_application.translation_units, file_name, "includes")
+
+    def get_macros(self) -> List[CMacro]:
+        """Return the macro definitions of every file; empty list if there are none."""
+        return _collect_from_units(self.c_application.translation_units, "macros")
+
+    def get_macros_in_file(self, file_name: str) -> List[CMacro] | None:
+        """Return the macro definitions of ``file_name``, or None if the file is unknown."""
+        return _lookup_in_unit(self.c_application.translation_units, file_name, "macros")
+
+    def get_typedefs(self) -> List[CTypedef]:
+        """Return the typedef declarations of every file; empty list if there are none."""
+        return _collect_from_units(self.c_application.translation_units, "typedefs")
+
+    def get_typedefs_in_file(self, file_name: str) -> List[CTypedef] | None:
+        """Return the typedef declarations of ``file_name``, or None if the file is unknown."""
+        return _lookup_in_unit(self.c_application.translation_units, file_name, "typedefs")
+
+    def get_structs(self) -> List[CStruct]:
+        """Return the struct/union declarations of every file; empty list if there are none."""
+        return _collect_from_units(self.c_application.translation_units, "structs")
+
+    def get_structs_in_file(self, file_name: str) -> List[CStruct] | None:
+        """Return the struct/union declarations of ``file_name``, or None if the file is unknown."""
+        return _lookup_in_unit(self.c_application.translation_units, file_name, "structs")
+
+    def get_enums(self) -> List[CEnum]:
+        """Return the enum declarations of every file; empty list if there are none."""
+        return _collect_from_units(self.c_application.translation_units, "enums")
+
+    def get_enums_in_file(self, file_name: str) -> List[CEnum] | None:
+        """Return the enum declarations of ``file_name``, or None if the file is unknown."""
+        return _lookup_in_unit(self.c_application.translation_units, file_name, "enums")
+
+    def get_globals(self, file_name: str) -> List[CVariable] | None:
+        """Return the global variables of ``file_name``, or None if the file is unknown."""
+        return _lookup_in_unit(self.c_application.translation_units, file_name, "globals")
+
+
+# Module-level variants kept for backward compatibility. They operate on any object
+# that exposes a ``translation_units`` mapping; CAnalysis users should call the methods.
+def get_includes(self) -> List[str]:
+    """Return the include directives of every unit in ``self.translation_units``."""
+    return _collect_from_units(self.translation_units, "includes")
+
+
+def get_includes_in_file(self, file_name: str) -> List[str] | None:
+    """Return the include directives of ``file_name``, or None if the file is unknown."""
+    return _lookup_in_unit(self.translation_units, file_name, "includes")
+
+
+def get_macros(self) -> List[CMacro]:
+    """Return the macro definitions of every unit in ``self.translation_units``."""
+    return _collect_from_units(self.translation_units, "macros")
+
+
+def get_macros_in_file(self, file_name: str) -> List[CMacro] | None:
+    """Return the macro definitions of ``file_name``, or None if the file is unknown."""
+    return _lookup_in_unit(self.translation_units, file_name, "macros")
+
+
+def get_typedefs(self) -> List[CTypedef]:
+    """Return the typedef declarations of every unit in ``self.translation_units``."""
+    return _collect_from_units(self.translation_units, "typedefs")
+
+
+def get_typedefs_in_file(self, file_name: str) -> List[CTypedef] | None:
+    """Return the typedef declarations of ``file_name``, or None if the file is unknown."""
+    return _lookup_in_unit(self.translation_units, file_name, "typedefs")
+
+
+def get_structs(self) -> List[CStruct]:
+    """Return the struct/union declarations of every unit in ``self.translation_units``."""
+    return _collect_from_units(self.translation_units, "structs")
+
+
+def get_structs_in_file(self, file_name: str) -> List[CStruct] | None:
+    """Return the struct/union declarations of ``file_name``, or None if the file is unknown."""
+    return _lookup_in_unit(self.translation_units, file_name, "structs")
+
+
+def get_enums(self) -> List[CEnum]:
+    """Return the enum declarations of every unit in ``self.translation_units``."""
+    return _collect_from_units(self.translation_units, "enums")
+
+
+def get_enums_in_file(self, file_name: str) -> List[CEnum] | None:
+    """Return the enum declarations of ``file_name``, or None if the file is unknown."""
+    return _lookup_in_unit(self.translation_units, file_name, "enums")
+
+
+def get_globals(self, file_name: str) -> List[CVariable] | None:
+    """Return the global variables of ``file_name``, or None if the file is unknown."""
+    return _lookup_in_unit(self.translation_units, file_name, "globals")
diff --git a/cldk/analysis/c/clang/__init__.py b/cldk/analysis/c/clang/__init__.py
new file mode 100644
index 0000000..f4e43a5
--- /dev/null
+++ b/cldk/analysis/c/clang/__init__.py
@@ -0,0 +1,23 @@
+################################################################################
+# Copyright IBM Corporation 2024
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+"""
+Analysis package
+"""
+
+from .clang_analyzer import ClangAnalyzer
+
+__all__ = ["ClangAnalyzer"]
diff --git a/cldk/analysis/c/clang/clang_analyzer.py b/cldk/analysis/c/clang/clang_analyzer.py
new file mode 100644
index 0000000..f7f5d51
--- /dev/null
+++ b/cldk/analysis/c/clang/clang_analyzer.py
@@ -0,0 +1,305 @@
+import os
+import platform
+from clang.cindex import Config
+from pathlib import Path
+from typing import List, Optional
+from cldk.models.c import CFunction, CMacro, CCallSite, CTranslationUnit, CApplication
+import logging
+from ipdb import set_trace  # NOTE(review): debugging helper, unused in this module - confirm it can be dropped
+
+from cldk.models.c.models import CInclude, CParameter, CVariable, StorageClass
+
+logger = logging.getLogger(__name__)
+
+# First, we only import Config from clang.cindex
+from clang.cindex import Config
+
+
+def find_libclang() -> str:
+    """
+    Locate the libclang shared library for the current operating system.
+
+    This runs before any other Clang functionality is used so that the
+    bindings can be pointed at the library first.
+
+    Returns:
+        str: The first candidate path that exists on disk.
+
+    Raises:
+        RuntimeError: If the OS is neither macOS nor Linux, or no candidate exists.
+    """
+    system = platform.system()
+
+    if system == "Darwin":
+        possible_paths = [
+            "/opt/homebrew/opt/llvm/lib/libclang.dylib",  # Apple Silicon
+            "/usr/local/opt/llvm/lib/libclang.dylib",  # Intel Mac
+            "/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/libclang.dylib",
+        ]
+        install_instructions = "Install LLVM using: brew install llvm"
+    elif system == "Linux":
+        possible_paths = [
+            "/usr/lib/llvm-14/lib/libclang.so",
+            "/usr/lib/llvm-13/lib/libclang.so",
+            "/usr/lib/llvm-12/lib/libclang.so",
+            "/usr/lib/x86_64-linux-gnu/libclang-14.so.1",
+            "/usr/lib/libclang.so",
+        ]
+        install_instructions = "Install libclang using: sudo apt-get install libclang-dev"
+    else:
+        raise RuntimeError(f"Unsupported operating system: {system}")
+
+    # Return the first candidate that exists
+    for path in possible_paths:
+        if os.path.exists(path):
+            logger.info(f"Found libclang at: {path}")
+            return path
+
+    raise RuntimeError(f"Could not find libclang library. \n" f"Please ensure LLVM is installed:\n{install_instructions}")
+
+
+# Initialize libclang at module level
+try:
+    libclang_path = find_libclang()
+    Config.set_library_file(libclang_path)
+    logger.info("Successfully initialized libclang")
+
+    # Now that libclang is initialized, we can safely import other Clang components
+    from clang.cindex import Index, TranslationUnit, CursorKind, TypeKind, CompilationDatabase
+
+except Exception as e:
+    logger.error(f"Failed to initialize libclang: {e}")
+    raise
+
+
+def _to_storage_class(cursor) -> Optional[StorageClass]:
+    """Translate a cursor's Clang storage class into the model ``StorageClass``.
+
+    Clang's enum and the model enum are unrelated types, so comparing them
+    directly is always False; they are matched by member name instead. Kinds
+    with no model counterpart yield None.
+    """
+    clang_storage = getattr(cursor, "storage_class", None)
+    return StorageClass.__members__.get(getattr(clang_storage, "name", ""))
+
+
+class ClangAnalyzer:
+    """Analyzes C code using Clang's Python bindings."""
+
+    def __init__(self, compilation_database_path: Optional[Path] = None):
+        """Create the Clang index and, if a directory is given, load its compilation database."""
+        self.index = Index.create()
+        self.compilation_database = None
+        # TODO: Implement compilation database for C/C++ projects so that we can get compile arguments for each file
+        # and parse them correctly. This is useful for projects with complex build systems.
+        if compilation_database_path:
+            self.compilation_database = CompilationDatabase.fromDirectory(str(compilation_database_path))
+
+    @staticmethod
+    def __find_libclang(self=None) -> str:
+        """Return the libclang path by delegating to the module-level ``find_libclang``.
+
+        The parameter is unused and optional; it is kept only because the previous
+        signature declared it.
+        """
+        return find_libclang()
+
+    def analyze_file(self, file_path: Path) -> CTranslationUnit:
+        """Parse one source file with Clang and return its translation unit model."""
+        tu = self.index.parse(
+            str(file_path),
+            args=self._get_compile_args(file_path),
+            options=TranslationUnit.PARSE_DETAILED_PROCESSING_RECORD,
+        )
+
+        translation_unit = CTranslationUnit(
+            file_path=str(file_path),
+            is_header=file_path.suffix in {".h", ".hpp", ".hxx"},
+        )
+        self._process_translation_unit(tu.cursor, translation_unit)
+        return translation_unit
+
+    def _process_translation_unit(self, cursor, translation_unit: CTranslationUnit):
+        """Record the functions and include directives declared directly in this file."""
+        for child in cursor.get_children():
+            source = child.location.file
+            if source and str(source) != translation_unit.file_path:
+                # Skip declarations from included files
+                continue
+
+            if child.kind == CursorKind.FUNCTION_DECL:
+                func = self._extract_function(child)
+                # A prototype seen after the definition must not replace the definition and its body.
+                if child.is_definition() or func.name not in translation_unit.functions:
+                    translation_unit.functions[func.name] = func
+            elif child.kind == CursorKind.INCLUSION_DIRECTIVE:
+                translation_unit.includes.append(self._process_inclusion(child))
+
+    def _process_inclusion(self, cursor):
+        """Build a ``CInclude`` from an inclusion directive cursor.
+
+        An include is flagged as a system include when a ``<`` token appears in
+        the directive (``#include <header.h>``), as opposed to the quoted form
+        (``#include "header.h"``).
+
+        Args:
+            cursor: Cursor to the include directive.
+
+        Returns:
+            CInclude: Name, system flag, line number and token text of the directive.
+        """
+        tokens = list(cursor.get_tokens())
+        return CInclude(
+            name=cursor.displayname,
+            is_system=any(token.spelling == "<" for token in tokens),
+            line_number=cursor.location.line,
+            full_text=" ".join(token.spelling for token in tokens),
+        )
+
+    def _extract_parameter(self, param) -> CParameter:
+        """Build a ``CParameter`` from a parameter cursor.
+
+        The default value is the text after the first ``=`` token of the
+        declaration, or None when there is none.
+        """
+        default_value = None
+        try:
+            spellings = [token.spelling for token in param.get_tokens()]
+        except Exception as e:
+            # Best effort: a parameter without a default is still worth recording.
+            logger.warning(f"Could not extract default value for parameter {param.spelling}: {e}")
+        else:
+            if "=" in spellings:
+                default_value = " ".join(spellings[spellings.index("=") + 1 :]) or None
+
+        # NOTE(review): CParameter in models.py declares no default_value field, so the
+        # value is presumably dropped by pydantic until the model is extended - confirm.
+        return CParameter(
+            name=param.spelling or f"arg_{param.type.spelling.replace(' ', '_')}",
+            type=param.type.spelling,
+            is_const=param.type.is_const_qualified(),
+            is_volatile=param.type.is_volatile_qualified(),
+            default_value=default_value,
+        )
+
+    def _extract_variable(self, cursor) -> CVariable:
+        """Build a ``CVariable`` from a variable declaration cursor."""
+        return CVariable(
+            name=cursor.spelling,
+            type=cursor.type.spelling,
+            storage_class=_to_storage_class(cursor),
+            is_const=cursor.type.is_const_qualified(),
+            is_volatile=cursor.type.is_volatile_qualified(),
+            start_line=cursor.extent.start.line,
+            end_line=cursor.extent.end.line,
+        )
+
+    def _extract_function_body(self, cursor) -> str:
+        """Return the brace-delimited body of a function definition as space-joined tokens.
+
+        Args:
+            cursor: Cursor to the function.
+
+        Returns:
+            str: The body including both outer braces, or "" for a declaration
+            or when the tokens cannot be read.
+        """
+        if not cursor.is_definition():
+            return ""
+
+        try:
+            spellings = [token.spelling for token in cursor.get_tokens()]
+        except Exception as e:
+            logger.error(f"Error extracting function body: {e}")
+            return ""
+        if "{" not in spellings:
+            return ""
+
+        depth = 0
+        body = []
+        for spelling in spellings[spellings.index("{") :]:
+            body.append(spelling)
+            if spelling == "{":
+                depth += 1
+            elif spelling == "}":
+                depth -= 1
+                if depth == 0:
+                    # Matching close of the outermost brace: the body is complete.
+                    break
+
+        if depth != 0:
+            logger.warning(f"Unbalanced braces in function body: {cursor.spelling}")
+        return " ".join(body)
+
+    def _extract_function(self, cursor) -> CFunction:
+        """Build a ``CFunction`` from a function declaration or definition cursor."""
+        spellings = [token.spelling for token in cursor.get_tokens()]
+        # Specifiers of the function itself appear before the parameter list; looking
+        # further would pick up keywords used inside the body.
+        header = spellings[: spellings.index("(")] if "(" in spellings else spellings
+
+        parameters = [self._extract_parameter(param) for param in cursor.get_arguments()]
+
+        # Collect call sites and local variables
+        call_sites = []
+        local_vars = []
+        if cursor.is_definition():
+            for child in cursor.walk_preorder():
+                if child.kind == CursorKind.CALL_EXPR:
+                    call_sites.append(self._extract_call_site(child))
+                elif child.kind == CursorKind.VAR_DECL:
+                    local_vars.append(self._extract_variable(child))
+
+        return CFunction(
+            name=cursor.spelling,
+            return_type=cursor.result_type.spelling,
+            parameters=parameters,
+            storage_class=_to_storage_class(cursor),
+            is_inline="inline" in header,
+            is_variadic=cursor.type.is_function_variadic(),
+            body=self._extract_function_body(cursor),
+            comment=cursor.brief_comment or "",
+            call_sites=call_sites,
+            local_variables=local_vars,
+            start_line=cursor.extent.start.line,
+            end_line=cursor.extent.end.line,
+        )
+
+    def _extract_call_site(self, cursor) -> CCallSite:
+        """Build a ``CCallSite`` from a call expression cursor."""
+        # NOTE(review): heuristic kept as written; whether it detects calls through
+        # function pointers should be confirmed against the libclang documentation.
+        is_indirect = cursor.referenced is None and cursor.type.kind == TypeKind.FUNCTIONPROTO
+
+        # A call expression is typed by its result, so fall back to the expression
+        # type when the type is not a function type with a result of its own.
+        return_type = cursor.type.get_result().spelling or cursor.type.spelling
+
+        return CCallSite(
+            function_name=cursor.spelling,
+            argument_types=[arg.type.spelling for arg in cursor.get_arguments()],
+            is_indirect_call=is_indirect,
+            return_type=return_type,
+            start_line=cursor.extent.start.line,
+            start_column=cursor.extent.start.column,
+            end_line=cursor.extent.end.line,
+            end_column=cursor.extent.end.column,
+        )
+
+    def _get_compile_args(self, file_path: Path) -> List[str]:
+        """Return the Clang arguments for ``file_path``.
+
+        Uses the first command recorded in the compilation database, minus the
+        compiler executable and the file itself; otherwise a C++17 default.
+        """
+        default_args = ["-x", "c++", "-std=c++17"]
+        if not self.compilation_database:
+            return default_args
+
+        commands = self.compilation_database.getCompileCommands(str(file_path))
+        if not commands:
+            return default_args
+
+        # ``arguments`` is an iterator in clang.cindex, so materialize it before slicing.
+        arguments = list(commands[0].arguments)
+        return [arg for arg in arguments[1:] if arg != str(file_path)]
diff --git a/cldk/analysis/common/__init__.py b/cldk/analysis/common/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/cldk/analysis/common/lsp/__init__.py b/cldk/analysis/common/lsp/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/cldk/analysis/common/lsp/lsp.py b/cldk/analysis/common/lsp/lsp.py
new file mode 100644
index 0000000..e69de29
diff --git a/cldk/core.py b/cldk/core.py
index 6bcb8e6..b3c7f81 100644
--- a/cldk/core.py
+++ b/cldk/core.py
@@ -24,6 +24,7 @@ from typing import List from cldk.analysis import AnalysisLevel +from cldk.analysis.c import CAnalysis from cldk.analysis.java import JavaAnalysis from cldk.analysis.java.treesitter import JavaSitter from cldk.utils.exceptions import CldkInitializationException @@ -128,6 +129,8 @@ def analysis( target_files=target_files, eager_analysis=eager, ) 
+ elif self.language == "c": + return CAnalysis(project_dir=project_path) else: raise NotImplementedError(f"Analysis support for {self.language} is not implemented yet.") @@ -146,7 +149,7 @@ def treesitter_parser(self): else: raise NotImplementedError(f"Treesitter parser for {self.language} is not implemented yet.") - def tree_sitter_utils(self, source_code: str) -> [TreesitterSanitizer | NotImplementedError]: + def tree_sitter_utils(self, source_code: str) -> [TreesitterSanitizer | NotImplementedError]: # type: ignore """ Parse the project using treesitter. diff --git a/cldk/models/c/__init__.py b/cldk/models/c/__init__.py new file mode 100644 index 0000000..451b1a8 --- /dev/null +++ b/cldk/models/c/__init__.py @@ -0,0 +1,21 @@ +################################################################################ +# Copyright IBM Corporation 2024 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +""" +C/C++ package +""" + +from cldk.models.c.models import * diff --git a/cldk/models/c/models.py b/cldk/models/c/models.py new file mode 100644 index 0000000..5e011da --- /dev/null +++ b/cldk/models/c/models.py @@ -0,0 +1,313 @@ +from typing import Dict, List, Optional, Union +from pydantic import BaseModel, field_validator +from enum import Enum + + +class StorageClass(Enum): + """Represents C storage class specifiers.""" + + AUTO = "auto" + REGISTER = "register" + STATIC = "static" + EXTERN = "extern" + TYPEDEF = "typedef" + + +class CVariable(BaseModel): + """Represents a variable declaration in C. + + Attributes: + name (str): The name of the variable + type (str): The type of the variable (including any type qualifiers) + storage_class: The storage class specifier (if any) + is_const (bool): Whether the variable is const-qualified + is_volatile (bool): Whether the variable is volatile-qualified + initializer (str): Initial value expression, if any + array_dimensions (List[str]): Dimensions if this is an array variable + is_pointer (bool): Whether this is a pointer variable + pointer_level (int): Level of pointer indirection (e.g., 2 for char**) + """ + + name: str + type: str + storage_class: Optional[StorageClass] = None + is_const: bool = False + is_volatile: bool = False + initializer: Optional[str] = None + array_dimensions: List[str] = [] + is_pointer: bool = False + pointer_level: int = 0 + start_line: int + end_line: int + + +class CFunctionPointer(BaseModel): + """Represents a function pointer type. 
+ + Attributes: + return_type (str): Return type of the function being pointed to + parameter_types (List[str]): Types of the parameters + calling_convention (Optional[str]): Calling convention if specified + """ + + return_type: str + parameter_types: List[str] + calling_convention: Optional[str] = None + + +class CMacro(BaseModel): + """Represents a C preprocessor macro. + + Attributes: + name (str): Name of the macro + parameters (List[str]): Parameters for function-like macros + replacement (str): Replacement text + is_function_like (bool): Whether this is a function-like macro + start_line (int): Starting line in source + end_line (int): Ending line in source + """ + + name: str + parameters: List[str] = [] + replacement: str + is_function_like: bool = False + start_line: int + end_line: int + + +class CParameter(BaseModel): + """Represents a parameter in a function declaration. + + Attributes: + name (str): Parameter name (may be empty in declarations) + type (str): Parameter type + is_const (bool): Whether parameter is const-qualified + is_volatile (bool): Whether parameter is volatile-qualified + is_pointer (bool): Whether parameter is a pointer + pointer_level (int): Level of pointer indirection + array_dimensions (List[str]): Array dimensions if parameter is array + """ + + name: str + type: str + is_const: bool = False + is_volatile: bool = False + is_pointer: bool = False + pointer_level: int = 0 + array_dimensions: List[str] = [] + + +class CCallSite(BaseModel): + """Represents a function call in C code. 
+ + Attributes: + function_name (str): Name of the called function + argument_types (List[str]): Types of the arguments + is_indirect_call (bool): Whether this is a call through function pointer + is_macro_expansion (bool): Whether this call is from macro expansion + return_type (str): Return type of the called function + start_line (int): Starting line of the call + start_column (int): Starting column of the call + end_line (int): Ending line of the call + end_column (int): Ending column of the call + """ + + function_name: str + argument_types: List[str] + is_indirect_call: bool = False + is_macro_expansion: bool = False + return_type: str = "" + start_line: int + start_column: int + end_line: int + end_column: int + + +class CFunction(BaseModel): + """Represents a C function. + + Attributes: + name (str): Function name + return_type (str): Return type + parameters (List[CParameter]): Function parameters + storage_class (Optional[StorageClass]): Storage class if specified + is_inline (bool): Whether function is inline + is_const (bool): Whether function is const-qualified (C++) + is_variadic (bool): Whether function takes variable arguments + body (str): Function body code + comment (str): Associated comments/documentation + referenced_types (List[str]): Types referenced in function + accessed_globals (List[str]): Global variables accessed + call_sites (List[CCallSite]): Function calls made + local_variables (List[CVariable]): Local variable declarations + macros_used (List[str]): Macros used in function + start_line (int): Starting line in source + end_line (int): Ending line in source + cyclomatic_complexity (Optional[int]): Cyclomatic complexity if calculated + """ + + name: str + return_type: str + parameters: List[CParameter] + storage_class: Optional[StorageClass] = None + is_inline: bool = False + is_const: bool = False + is_variadic: bool = False + body: str + comment: str = "" + referenced_types: List[str] = [] + accessed_globals: List[str] = [] + 
call_sites: List[CCallSite] = [] + local_variables: List[CVariable] = [] + macros_used: List[str] = [] + start_line: int + end_line: int + cyclomatic_complexity: Optional[int] = None + + +class CStruct(BaseModel): + """Represents a C struct or union. + + Attributes: + name (str): Name of the struct + is_union (bool): Whether this is a union + members (List[CVariable]): Member variables + is_packed (bool): Whether struct is packed + alignment (Optional[int]): Specified alignment if any + comment (str): Associated comments + referenced_types (List[str]): Types referenced in struct + """ + + name: str + is_union: bool = False + members: List[CVariable] + is_packed: bool = False + alignment: Optional[int] = None + comment: str = "" + referenced_types: List[str] = [] + start_line: int + end_line: int + + +class CEnum(BaseModel): + """Represents a C enum declaration. + + Attributes: + name (str): Name of the enum + constants (Dict[str, int]): Enum constants and their values + comment (str): Associated comments + """ + + name: str + constants: Dict[str, int] + comment: str = "" + start_line: int + end_line: int + + +class CTypedef(BaseModel): + """Represents a typedef declaration. + + Attributes: + name (str): New type name being defined + underlying_type (str): The actual type being aliased + is_function_pointer (bool): Whether this is a function pointer typedef + function_pointer: Details if this is a function pointer typedef + """ + + name: str + underlying_type: str + is_function_pointer: bool = False + function_pointer: Optional[CFunctionPointer] = None + start_line: int + end_line: int + + +class CInclude(BaseModel): + """Represents a C include directive. 
+ + Attributes: + name (str): Name of the included file + is_system (bool): Whether this is a system include + line_number (int): Line number in source + full_text (str): Full text of the include directive + """ + + name: str + is_system: bool + line_number: int + full_text: str + + +class CTranslationUnit(BaseModel): + """Represents a C source file. + + Attributes: + file_path (str): Path to the source file + includes (List[CInclude]): Header files included + macros (List[CMacro]): Macro definitions + typedefs (List[CTypedef]): Typedef declarations + structs (List[CStruct]): Struct/union declarations + enums (List[CEnum]): Enum declarations + globals (List[CVariable]): Global variable declarations + functions (Dict[str, CFunction]): Function declarations/definitions + is_header (bool): Whether this is a header file + """ + + file_path: str + includes: List[CInclude] = [] + macros: List[CMacro] = [] + typedefs: List[CTypedef] = [] + structs: List[CStruct] = [] + enums: List[CEnum] = [] + globals: List[CVariable] = [] + functions: Dict[str, CFunction] = {} + is_header: bool = False + is_modified: bool = False + + +class CFunctionDetail(BaseModel): + """Represents detailed information about a function. + + Attributes: + function_declaration (str): Full function declaration + file_path (str): Path to the file containing the function + function (CFunction): Detailed function information + """ + + function_declaration: str + file_path: str + function: CFunction + + def __hash__(self): + return hash((self.function_declaration, self.file_path)) + + +class CCallGraphEdge(BaseModel): + """Represents an edge in the call graph. 
+ + Attributes: + source (CFunctionDetail): Calling function + target (CFunctionDetail): Called function + type (str): Type of call relationship + weight (str): Edge weight/importance + is_indirect (bool): Whether this is through function pointer + """ + + source: CFunctionDetail + target: CFunctionDetail + type: str + weight: str + is_indirect: bool = False + + +class CApplication(BaseModel): + """Represents a complete C application. + + Attributes: + translation_units (Dict[str, CTranslationUnit]): All source files + call_graph (List[CCallGraphEdge]): Function call relationships + """ + + translation_units: Dict[str, CTranslationUnit] + call_graph: List[CCallGraphEdge] = [] diff --git a/poetry.lock b/poetry.lock index 40ce2be..a6bafee 100644 --- a/poetry.lock +++ b/poetry.lock @@ -477,6 +477,17 @@ files = [ {file = "charset_normalizer-3.4.0.tar.gz", hash = "sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e"}, ] +[[package]] +name = "clang" +version = "17.0.6" +description = "libclang python bindings" +optional = false +python-versions = "*" +files = [ + {file = "clang-17.0.6-py3-none-any.whl", hash = "sha256:d05ad6dddc9b360e94b9420e239c639a9117902cce8a57fd288a5226eea3092e"}, + {file = "clang-17.0.6.tar.gz", hash = "sha256:d228511e6a29e866dcbe99e10ed10649317b9b3e636ba805f6867b7afb6e8c44"}, +] + [[package]] name = "click" version = "8.1.7" @@ -1301,6 +1312,25 @@ files = [ {file = "jupyterlab_widgets-3.0.13.tar.gz", hash = "sha256:a2966d385328c1942b683a8cd96b89b8dd82c8b8f81dda902bb2bc06d46f5bed"}, ] +[[package]] +name = "libclang" +version = "18.1.1" +description = "Clang Python Bindings, mirrored from the official LLVM repo: https://github.com/llvm/llvm-project/tree/main/clang/bindings/python, to make the installation process easier." 
+optional = false +python-versions = "*" +files = [ + {file = "libclang-18.1.1-1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:0b2e143f0fac830156feb56f9231ff8338c20aecfe72b4ffe96f19e5a1dbb69a"}, + {file = "libclang-18.1.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:6f14c3f194704e5d09769108f03185fce7acaf1d1ae4bbb2f30a72c2400cb7c5"}, + {file = "libclang-18.1.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:83ce5045d101b669ac38e6da8e58765f12da2d3aafb3b9b98d88b286a60964d8"}, + {file = "libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl", hash = "sha256:c533091d8a3bbf7460a00cb6c1a71da93bffe148f172c7d03b1c31fbf8aa2a0b"}, + {file = "libclang-18.1.1-py2.py3-none-manylinux2014_aarch64.whl", hash = "sha256:54dda940a4a0491a9d1532bf071ea3ef26e6dbaf03b5000ed94dd7174e8f9592"}, + {file = "libclang-18.1.1-py2.py3-none-manylinux2014_armv7l.whl", hash = "sha256:cf4a99b05376513717ab5d82a0db832c56ccea4fd61a69dbb7bccf2dfb207dbe"}, + {file = "libclang-18.1.1-py2.py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:69f8eb8f65c279e765ffd28aaa7e9e364c776c17618af8bff22a8df58677ff4f"}, + {file = "libclang-18.1.1-py2.py3-none-win_amd64.whl", hash = "sha256:4dd2d3b82fab35e2bf9ca717d7b63ac990a3519c7e312f19fa8e86dcc712f7fb"}, + {file = "libclang-18.1.1-py2.py3-none-win_arm64.whl", hash = "sha256:3f0e1f49f04d3cd198985fea0511576b0aee16f9ff0e0f0cad7f9c57ec3c20e8"}, + {file = "libclang-18.1.1.tar.gz", hash = "sha256:a1214966d08d73d971287fc3ead8dfaf82eb07fb197680d8b3859dbbbbf78250"}, +] + [[package]] name = "markdown" version = "3.7" @@ -3407,4 +3437,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = ">=3.11" -content-hash = "c4a5abeea039e19305c3981a9327477312ee04f6fc92086d3a48d00bd56b939f" +content-hash = "4217a1f1206f00d06f6b86a5225be08c9a4e66adc050befbbd714a9e77638a98" diff --git a/pyproject.toml b/pyproject.toml index 6acc889..74422ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "cldk" -version = "0.4.0" +version = 
"0.5.0" description = "codellm-devkit: A python library for seamless integration with LLMs." authors = ["Rahul Krishna ", "Rangeet Pan ", "Saurabh Sinhas ", "Raju Pavuluri "] @@ -44,6 +44,8 @@ tree-sitter-go = "0.23.1" tree-sitter-python = "0.23.2" tree-sitter-javascript = "0.23.0" # Test dependencies +libclang = "^18.1.1" +clang = "^17.0.6" [tool.poetry.group.dev.dependencies] toml = "^0.10.2" From 1e924937890d75cb0902b79641de5bc9b6c8392a Mon Sep 17 00:00:00 2001 From: Rahul Krishna Date: Wed, 29 Jan 2025 12:55:05 -0500 Subject: [PATCH 2/4] Refactor ClangAnalyzer and implement Signed-off-by: Rahul Krishna --- cldk/analysis/c/c_analysis.py | 4 +- cldk/analysis/c/clang/clang_analyzer.py | 55 ------------------------- 2 files changed, 1 insertion(+), 58 deletions(-) diff --git a/cldk/analysis/c/c_analysis.py b/cldk/analysis/c/c_analysis.py index a5e38d4..41817a7 100644 --- a/cldk/analysis/c/c_analysis.py +++ b/cldk/analysis/c/c_analysis.py @@ -188,9 +188,7 @@ def get_C_compilation_unit(self, file_path: str) -> CTranslationUnit: Returns: CTranslationUnit: Compilation unit object for C source file """ - if self.analysis_backend in [AnalysisEngine.CODEQL, AnalysisEngine.TREESITTER]: - raise NotImplementedError("Support for this functionality has not been implemented yet.") - return self.backend.get_C_compilation_unit(file_path) + return self.c_application.translation_units.get(file_path) def get_functions_in_file(self, file_name: str) -> List[CFunction]: """Returns a dictionary of all methods of the given class. 
diff --git a/cldk/analysis/c/clang/clang_analyzer.py b/cldk/analysis/c/clang/clang_analyzer.py index f7f5d51..0f4d9e6 100644 --- a/cldk/analysis/c/clang/clang_analyzer.py +++ b/cldk/analysis/c/clang/clang_analyzer.py @@ -81,61 +81,6 @@ def __init__(self, compilation_database_path: Optional[Path] = None): if compilation_database_path: self.compilation_database = CompilationDatabase.fromDirectory(str(compilation_database_path)) - @staticmethod - def __find_libclang(self) -> str: - """ - Find libclang library on the system. This function detects the operating system - and searches in platform-specific locations. - - Returns: - str: Path to the libclang library - - Raises: - RuntimeError: If libclang cannot be found in any of the expected locations - """ - system = platform.system() - - if system == "Darwin": # macOS - possible_paths = [ - # Apple Silicon Mac paths - "/opt/homebrew/opt/llvm/lib/libclang.dylib", - # Intel Mac paths - "/usr/local/opt/llvm/lib/libclang.dylib", - # Xcode path - "/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/libclang.dylib", - ] - install_instructions = ( - "Could not find libclang. Please install LLVM using Homebrew:\n" - " 1. Run: brew install llvm\n" - " 2. Make sure the installation succeeded\n" - " 3. You might need to restart your terminal" - ) - elif system == "Linux": - possible_paths = [ - # Common Linux paths for different LLVM versions - "/usr/lib/llvm-14/lib/libclang.so", - "/usr/lib/llvm-13/lib/libclang.so", - "/usr/lib/llvm-12/lib/libclang.so", - "/usr/lib/x86_64-linux-gnu/libclang-14.so.1", - "/usr/lib/libclang.so", - ] - install_instructions = ( - "Could not find libclang. 
Please install LLVM development libraries:\n" - " Ubuntu/Debian: sudo apt-get install libclang-dev\n" - " Fedora: sudo dnf install clang-devel\n" - " Arch Linux: sudo pacman -S clang" - ) - else: - raise RuntimeError(f"Unsupported operating system: {system}") - - # Try to find the library in the possible locations - for path in possible_paths: - if os.path.exists(path): - logger.info(f"Found libclang at: {path}") - return path - - raise RuntimeError(install_instructions) - def analyze_file(self, file_path: Path) -> CTranslationUnit: """Analyzes a single C source file using Clang.""" From ad6bf1895b3033b14b78c7ae4a32baf2923f8ea1 Mon Sep 17 00:00:00 2001 From: Rahul Krishna Date: Wed, 29 Jan 2025 13:06:45 -0500 Subject: [PATCH 3/4] Anonymous variables are appropriately labeled. Signed-off-by: Rahul Krishna --- cldk/analysis/c/clang/clang_analyzer.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/cldk/analysis/c/clang/clang_analyzer.py b/cldk/analysis/c/clang/clang_analyzer.py index 0f4d9e6..e828632 100644 --- a/cldk/analysis/c/clang/clang_analyzer.py +++ b/cldk/analysis/c/clang/clang_analyzer.py @@ -166,19 +166,15 @@ def _extract_parameter(self, param) -> CParameter: careful token handling since the tokens form a generator that can only be consumed once. 
""" - # First, let's safely get any default value default_value = None try: - # Convert the token generator to a list so we can examine it safely tokens = list(param.get_tokens()) if tokens: - # If we have tokens, the first one might be our default value default_value = tokens[0].spelling except Exception as e: - # If anything goes wrong getting tokens, we'll log it and continue - print(f"Warning: Could not extract default value for parameter {param.spelling}: {e}") + logger.error(f"Warning: Could not extract default value for parameter {param.spelling}: {e}") - return CParameter(name=param.spelling or f"arg_{param.type.spelling.replace(' ', '_')}", type=param.type.spelling, default_value=default_value) + return CParameter(name=param.spelling or f"placeholder_arg_{param.type.spelling.replace(' ', '_')}", type=param.type.spelling, default_value=default_value) def _extract_variable(self, cursor) -> CVariable: """Extracts detailed variable information from a cursor.""" From 3891817fe1b90f09cd773dfa5ed07ebe44c55b53 Mon Sep 17 00:00:00 2001 From: Rahul Krishna Date: Wed, 5 Feb 2025 13:27:26 -0500 Subject: [PATCH 4/4] Update pyproject.toml --- pyproject.toml | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3a85645..538e261 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,42 +29,42 @@ include = [ [tool.poetry.dependencies] python = ">=3.11" -pydantic = "^2.6.1" -pandas = "^2.2.0" -networkx = "^3.3" +pydantic = "^2.10.6" +pandas = "^2.2.3" +networkx = "^3.4.2" pyarrow = "19.0.0" tree-sitter = "0.24.0" rich = "13.9.4" wget = "3.2" -requests = "^2.31.0" +requests = "^2.32.3" tree-sitter-java = "0.23.5" tree-sitter-c = "0.23.4" tree-sitter-go = "0.23.4" tree-sitter-python = "0.23.6" tree-sitter-javascript = "0.23.1" -libclang = "^18.1.1" -clang = "^17.0.6" +# Test dependencies +mkdocs-material = {extras = ["imaging"], version = "^9.6.2"} [tool.poetry.group.dev.dependencies] toml = 
"^0.10.2" -pytest = "8.3.3" +pytest = "8.3.4" pytest-pspec = "^0.0.4" -pytest-cov = "^5.0.0" -pylint = "^3.2.2" -flake8 = "^7.0.0" -black = "^24.4.2" -coverage = "^7.5.3" -jupyter = "^1.0.0" +pytest-cov = "^6.0.0" +pylint = "^3.3.4" +flake8 = "^7.1.1" +black = "^25.1.0" +coverage = "^7.6.10" +jupyter = "^1.1.1" [tool.poetry.group.doc.dependencies] -mkdocs-material = {extras = ["imaging"], version = "^9.6.2"} +mkdocs-material = "^9.6.2" mkdocs-autorefs = "^1.3.0" mkdocs-get-deps = "^0.2.0" mkdocs-material-extensions = "^1.3.1" mkdocstrings = "^0.28.0" -mkdocstrings-python = "^1.13.0" +mkdocstrings-python = "^1.14.5" mkdocs-git-revision-date-localized-plugin = "^1.3.0" [build-system]