From 7ea6c2823bdf0bda4012e13a629d5f29fd8a86c3 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Mon, 21 Apr 2025 15:29:34 +0200 Subject: [PATCH 1/3] Add git history csv reports --- ..._directories_with_commit_statistics.cypher | 85 +++++++++++++++++++ scripts/reports/GitHistoryCsv.sh | 56 ++++++++++++ 2 files changed, 141 insertions(+) create mode 100644 cypher/GitLog/List_git_file_directories_with_commit_statistics.cypher create mode 100755 scripts/reports/GitHistoryCsv.sh diff --git a/cypher/GitLog/List_git_file_directories_with_commit_statistics.cypher b/cypher/GitLog/List_git_file_directories_with_commit_statistics.cypher new file mode 100644 index 000000000..18ab2830d --- /dev/null +++ b/cypher/GitLog/List_git_file_directories_with_commit_statistics.cypher @@ -0,0 +1,85 @@ +// List git file directories and their statistics + + MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file:Git&File&!Repository) + WHERE git_file.deletedAt IS NULL // filter out deleted files + ORDER BY git_file.relativePath + WITH * + ,datetime.fromepochMillis(git_file.createdAtEpoch) AS fileCreatedAtTimestamp + ,datetime.fromepochMillis(coalesce(git_file.lastModificationAtEpoch, git_file.createdAtEpoch)) AS fileLastModificationAtTimestamp + WITH *, git_repository.name + '/' + git_file.relativePath AS filePath + WITH *, split(filePath, '/') AS pathElements + WITH *, pathElements[-1] AS fileName + MATCH (git_commit:Git&Commit)-[:CONTAINS_CHANGE]->(git_change:Git&Change)-->(old_files_included:Git&File&!Repository)-[:HAS_NEW_NAME*0..3]->(git_file) + WITH pathElements + ,fileCreatedAtTimestamp + ,fileLastModificationAtTimestamp + ,fileName + ,filePath AS fileRelativePath + ,split(git_commit.author, ' <')[0] AS author + ,max(git_commit.sha) AS maxCommitSha + ,collect(DISTINCT git_commit.sha) AS commitHashes + ,date(max(git_commit.date)) AS lastCommitDate +UNWIND pathElements AS pathElement + WITH * + ,coalesce(nullif(split(fileRelativePath, '/' + 
pathElement)[0], fileRelativePath), '') AS parent + WITH * + ,coalesce(nullif(parent,'') + '/', '') + pathElement AS directory + WHERE pathElement <> fileName + WITH directory AS directoryPath + ,split(directory, '/')[-1] AS directoryName + ,parent AS directoryParentPath + ,split(parent, '/')[-1] AS directoryParentName + ,size(split(directory, '/')) AS directoryPathLength + ,author + ,collect(DISTINCT fileRelativePath) AS files + ,max(date(fileCreatedAtTimestamp) ) AS lastCreationDate + ,max(date(fileLastModificationAtTimestamp)) AS lastModificationDate + ,apoc.coll.toSet(apoc.coll.flatten(collect(commitHashes))) AS commitHashes + ,max(maxCommitSha) AS maxCommitSha + ,max(lastCommitDate) AS lastCommitDate + ,max(fileRelativePath) AS maxFileRelativePath + ,duration.inDays(max(lastCommitDate), date()).days AS daysSinceLastCommit + ,duration.inDays(max(fileCreatedAtTimestamp), datetime()).days AS daysSinceLastCreation + ,duration.inDays(max(fileLastModificationAtTimestamp), datetime()).days AS daysSinceLastModification +// Assure that the authors are ordered by their commit count descending per directory +ORDER BY directoryPath ASCENDING, size(commitHashes) DESCENDING + WITH directoryPath + ,directoryName + ,directoryParentPath + ,directoryParentName + ,directoryPathLength + ,collect(author)[0] AS mainAuthor + ,collect(author)[1] AS secondAuthor + ,collect(author)[2] AS thirdAuthor + ,count(DISTINCT author) AS authorCount + ,size(apoc.coll.toSet(apoc.coll.flatten(collect(files)))) AS fileCount + ,size(apoc.coll.toSet(apoc.coll.flatten(collect(commitHashes)))) AS commitCount + ,max(lastCreationDate) AS lastCreationDate + ,max(lastModificationDate) AS lastModificationDate + ,max(maxCommitSha) AS maxCommitSha + ,max(lastCommitDate) AS lastCommitDate + ,min(daysSinceLastCommit) AS daysSinceLastCommit + ,min(daysSinceLastCreation) AS daysSinceLastCreation + ,min(daysSinceLastModification) AS daysSinceLastModification + ,max(maxFileRelativePath) AS maxFileRelativePath +// 
The final results are grouped by the statistic values like file count,... +RETURN collect(directoryPath)[-1] AS directoryPath + ,apoc.text.join(collect(directoryName), '/') AS directoryName + ,collect(directoryParentPath)[0] AS directoryParentPath + ,collect(directoryParentName)[0] AS directoryParentName + ,mainAuthor + ,secondAuthor + ,thirdAuthor + ,authorCount + ,fileCount + ,commitCount + ,lastCreationDate + ,lastModificationDate + ,lastCommitDate + ,daysSinceLastCommit + ,daysSinceLastCreation + ,daysSinceLastModification + ,maxCommitSha + ,maxFileRelativePath + ,max(directoryPathLength) AS directoryPathLength + ,count(DISTINCT directoryPath) AS combinedDirectoriesCount \ No newline at end of file diff --git a/scripts/reports/GitHistoryCsv.sh b/scripts/reports/GitHistoryCsv.sh new file mode 100755 index 000000000..77d357e5b --- /dev/null +++ b/scripts/reports/GitHistoryCsv.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash + +# Executes "GitLog" Cypher queries to get the "git-history-csv" CSV reports. +# It contains lists of files with only one author, last changed or created files, pairwise changed files,... + +# Requires executeQueryFunctions.sh, cleanupAfterReportGeneration.sh + +# Fail on any error ("-e" = exit on first error, "-o pipefail" exist on errors within piped commands) +set -o errexit -o pipefail + +# Overrideable Constants (defaults also defined in sub scripts) +REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"} + +## Get this "scripts/reports" directory if not already set +# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. +# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes. +# This way non-standard tools like readlink aren't needed. +REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-$( CDPATH=. 
cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} +echo "GitHistoryCsv: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}" + +# Get the "scripts" directory by taking the path of this script and going one directory up. +SCRIPTS_DIR=${SCRIPTS_DIR:-"${REPORTS_SCRIPT_DIR}/.."} # Repository directory containing the shell scripts +echo "GitHistoryCsv: SCRIPTS_DIR=${SCRIPTS_DIR}" + +# Get the "cypher" directory by taking the path of this script and going two directories up and then to "cypher". +CYPHER_DIR=${CYPHER_DIR:-"${REPORTS_SCRIPT_DIR}/../../cypher"} +echo "GitHistoryCsv: CYPHER_DIR=${CYPHER_DIR}" + +# Define functions to execute cypher queries from within a given file +source "${SCRIPTS_DIR}/executeQueryFunctions.sh" + +# Create report directory +REPORT_NAME="git-history-csv" +FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}" +mkdir -p "${FULL_REPORT_DIRECTORY}" + +# Local Constants +GIT_LOG_CYPHER_DIR="${CYPHER_DIR}/GitLog" + +echo "GitHistoryCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Processing git history..." + +# Detailed git file statistics +execute_cypher "${GIT_LOG_CYPHER_DIR}/List_git_files_with_commit_statistics_by_author.cypher" > "${FULL_REPORT_DIRECTORY}/List_git_files_with_commit_statistics_by_author.csv" +execute_cypher "${GIT_LOG_CYPHER_DIR}/List_git_files_that_were_changed_together_with_another_file.cypher" > "${FULL_REPORT_DIRECTORY}/List_git_files_that_were_changed_together_with_another_file.csv" +execute_cypher "${GIT_LOG_CYPHER_DIR}/List_git_file_directories_with_commit_statistics.cypher" > "${FULL_REPORT_DIRECTORY}/List_git_file_directories_with_commit_statistics.csv" + +# Overall distribution of how many files were changed with one git commit, how many were changed with two, etc. 
+execute_cypher "${GIT_LOG_CYPHER_DIR}/List_git_files_per_commit_distribution.cypher" > "${FULL_REPORT_DIRECTORY}/List_git_files_per_commit_distribution.csv" + +# Data basis for finding out if there is a correlation between pairwise changed files and their dependencies +execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files_with_dependencies.cypher" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files_with_dependencies.csv" + +# Clean-up after report generation. Empty reports will be deleted. +source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${FULL_REPORT_DIRECTORY}" + +echo "GitHistoryCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished." \ No newline at end of file From 14dceef6c7eb38a376606a068b484f917cf8551b Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Fri, 25 Apr 2025 16:31:30 +0200 Subject: [PATCH 2/3] Fix git commitCount to only contain unique hashes --- ...es_with_commit_statistics_by_author.cypher | 1 + jupyter/GitHistoryGeneral.ipynb | 32 ++++++++++++++++--- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher b/cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher index 34d4e18fc..456e8b766 100644 --- a/cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher +++ b/cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher @@ -14,6 +14,7 @@ UNWIND git_files AS git_file RETURN git_repository.name + '/' + git_file.relativePath AS filePath ,split(git_commit.author, ' <')[0] AS author ,count(DISTINCT git_commit.sha) AS commitCount + ,collect(DISTINCT git_commit.sha) AS commitHashes ,date(max(git_commit.date)) AS lastCommitDate ,max(date(fileCreatedAtTimestamp)) AS lastCreationDate ,max(date(fileLastModificationAtTimestamp)) AS lastModificationDate diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb index a100e5c95..37fad1113 100644 --- 
a/jupyter/GitHistoryGeneral.ipynb +++ b/jupyter/GitHistoryGeneral.ipynb @@ -493,9 +493,18 @@ " \"\"\"\n", " return values.iloc[1] if len(values) > 1 else None\n", "\n", - "def get_file_count_from_aggregated_file_paths(values: pd.Series):\n", + "def get_flattened_unique_values(values: pd.Series):\n", " \"\"\"\n", - " Return the file count from an array of array of file paths.\n", + " Return an array of unique string values from an array of array of strings.\n", + " Meant to be used as an aggregation function for dataframe grouping.\n", + " values : Series : The pandas Series of values\n", + " return : Series : The pandas Series of values\n", + " \"\"\"\n", + " return np.unique(np.concatenate(values.to_list()))\n", + "\n", + "def count_unique_aggregated_values(values: pd.Series):\n", + " \"\"\"\n", + " Return the number of unique values from an array of array of strings.\n", " Meant to be used as an aggregation function for dataframe grouping.\n", " values : Series : The pandas Series of values\n", " return : int : The number of files\n", @@ -573,7 +582,6 @@ "# Define how common non-grouped columns will be aggregated.\n", "# Hint: maxCommitSha might not seem very useful, but it actually helps by group similar directories in the final step\n", "common_named_aggregation = dict(\n", - " commitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n", " daysSinceLastCommit=pd.NamedAgg(column=\"daysSinceLastCommit\", aggfunc=\"min\"),\n", " daysSinceLastCreation=pd.NamedAgg(column=\"daysSinceLastCreation\", aggfunc=\"min\"),\n", " daysSinceLastModification=pd.NamedAgg(column=\"daysSinceLastModification\", aggfunc=\"min\"),\n", @@ -588,12 +596,14 @@ " filePaths=pd.NamedAgg(column=\"filePath\", aggfunc=np.unique),\n", " firstFile=pd.NamedAgg(column=\"filePath\", aggfunc=\"first\"),\n", " fileExtensions=pd.NamedAgg(column=\"fileExtension\", aggfunc=collect_as_array),\n", + " commitHashes=pd.NamedAgg(column=\"commitHashes\", aggfunc=get_flattened_unique_values),\n", + " 
intermediateCommitCount=pd.NamedAgg(column=\"commitHashes\", aggfunc=\"count\"),\n", " **common_named_aggregation\n", ")\n", "\n", "# Sort the grouped and aggregated entries by the name of the directory ascending and the number of commits descending.\n", "# The author with the most commits will then be listed first for each directory.\n", - "git_files_with_commit_statistics = git_files_with_commit_statistics.sort_values(by=['directoryPath', 'commitCount'], ascending=[True, False])\n", + "git_files_with_commit_statistics = git_files_with_commit_statistics.sort_values(by=['directoryPath', 'intermediateCommitCount'], ascending=[True, False])\n", "git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()\n", "\n", "# Debug\n", @@ -603,12 +613,13 @@ "# Group the entries again now only by their directory path to get the aggregated number of authors, the main author and the second author.\n", "# Hint: firstFile (similar to maxCommitSha) might not seem very useful, but it also helps to group similar directories in the final step\n", "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby('directoryPath').aggregate(\n", - " fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=get_file_count_from_aggregated_file_paths),\n", + " fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=count_unique_aggregated_values),\n", " firstFile=pd.NamedAgg(column=\"firstFile\", aggfunc=\"first\"),\n", " mostFrequentFileExtension=pd.NamedAgg(column=\"fileExtensions\", aggfunc=get_most_frequent_entry),\n", " authorCount=pd.NamedAgg(column=\"author\", aggfunc=\"nunique\"),\n", " mainAuthor=pd.NamedAgg(column=\"author\", aggfunc=\"first\"),\n", " secondAuthor=pd.NamedAgg(column=\"author\", aggfunc=second_entry),\n", + " commitCount=pd.NamedAgg(column=\"commitHashes\", aggfunc=count_unique_aggregated_values),\n", " **common_named_aggregation\n", ")\n", "git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()\n", @@ -669,6 
+680,17 @@ "git_files_with_commit_statistics.head(30)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "53fcd8b2", + "metadata": {}, + "outputs": [], + "source": [ + "# Print prepared data frame to CSV file\n", + "# git_files_with_commit_statistics.to_csv('git_files_with_commit_statistics.csv', index=False)" + ] + }, { "cell_type": "markdown", "id": "ccc11f52", From 2d0b800c48beb80164dd9a5c8f5d145d6923b991 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Fri, 25 Apr 2025 20:02:27 +0200 Subject: [PATCH 3/3] Use PREPARE_CONDA_ENVIRONMENT to fully skip conda --- scripts/activateCondaEnvironment.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/activateCondaEnvironment.sh b/scripts/activateCondaEnvironment.sh index 198ca28bd..7e12dc718 100755 --- a/scripts/activateCondaEnvironment.sh +++ b/scripts/activateCondaEnvironment.sh @@ -37,10 +37,10 @@ echo "activateCondaEnvironment: CONDA_PREFIX=${CONDA_PREFIX}" echo "activateCondaEnvironment: Current conda environment=${CONDA_DEFAULT_ENV}" echo "activateCondaEnvironment: Target conda environment=${CODEGRAPH_CONDA_ENVIRONMENT}" -PREPARE_CONDA_ENVIRONMENT=${PREPARE_CONDA_ENVIRONMENT:-"true"} # Wether to prepare then Conda environment if needed (default, "true") or use an already prepared Conda environment ("false") +PREPARE_CONDA_ENVIRONMENT=${PREPARE_CONDA_ENVIRONMENT:-"true"} # Wether to prepare a Python environment with Conda if needed (default, "true") or use an already prepared Conda environment ("false") -if [ "${CONDA_DEFAULT_ENV}" = "${CODEGRAPH_CONDA_ENVIRONMENT}" ] && [ "${PREPARE_CONDA_ENVIRONMENT}" = "false" ]; then - echo "activateCondaEnvironment: Skipping activation. Target conda environment ${CODEGRAPH_CONDA_ENVIRONMENT} is already activated." +if [ "${PREPARE_CONDA_ENVIRONMENT}" = "false" ]; then + echo "activateCondaEnvironment: Skipping activation. ${PREPARE_CONDA_ENVIRONMENT} is set to false." 
# "return" needs to be used here instead of "exit". # This script is included in another script by using "source". # "exit" would end the main script, "return" just ends this sub script.