Add git history csv reports #372

Merged 3 commits on Apr 25, 2025
85 changes: 85 additions & 0 deletions cypher/GitLog/List_git_file_directories_with_commit_statistics.cypher
@@ -0,0 +1,85 @@
// List git file directories and their statistics

MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file:Git&File&!Repository)
WHERE git_file.deletedAt IS NULL // filter out deleted files
ORDER BY git_file.relativePath
WITH *
,datetime.fromepochMillis(git_file.createdAtEpoch) AS fileCreatedAtTimestamp
,datetime.fromepochMillis(coalesce(git_file.lastModificationAtEpoch, git_file.createdAtEpoch)) AS fileLastModificationAtTimestamp
WITH *, git_repository.name + '/' + git_file.relativePath AS filePath
WITH *, split(filePath, '/') AS pathElements
WITH *, pathElements[-1] AS fileName
MATCH (git_commit:Git&Commit)-[:CONTAINS_CHANGE]->(git_change:Git&Change)-->(old_files_included:Git&File&!Repository)-[:HAS_NEW_NAME*0..3]->(git_file)
WITH pathElements
,fileCreatedAtTimestamp
,fileLastModificationAtTimestamp
,fileName
,filePath AS fileRelativePath
,split(git_commit.author, ' <')[0] AS author
,max(git_commit.sha) AS maxCommitSha
,collect(DISTINCT git_commit.sha) AS commitHashes
,date(max(git_commit.date)) AS lastCommitDate
UNWIND pathElements AS pathElement
WITH *
,coalesce(nullif(split(fileRelativePath, '/' + pathElement)[0], fileRelativePath), '') AS parent
WITH *
,coalesce(nullif(parent,'') + '/', '') + pathElement AS directory
WHERE pathElement <> fileName
WITH directory AS directoryPath
,split(directory, '/')[-1] AS directoryName
,parent AS directoryParentPath
,split(parent, '/')[-1] AS directoryParentName
,size(split(directory, '/')) AS directoryPathLength
,author
,collect(DISTINCT fileRelativePath) AS files
,max(date(fileCreatedAtTimestamp)) AS lastCreationDate
,max(date(fileLastModificationAtTimestamp)) AS lastModificationDate
,apoc.coll.toSet(apoc.coll.flatten(collect(commitHashes))) AS commitHashes
,max(maxCommitSha) AS maxCommitSha
,max(lastCommitDate) AS lastCommitDate
,max(fileRelativePath) AS maxFileRelativePath
,duration.inDays(max(lastCommitDate), date()).days AS daysSinceLastCommit
,duration.inDays(max(fileCreatedAtTimestamp), datetime()).days AS daysSinceLastCreation
,duration.inDays(max(fileLastModificationAtTimestamp), datetime()).days AS daysSinceLastModification
// Ensure that the authors are ordered by their commit count descending per directory
ORDER BY directoryPath ASCENDING, size(commitHashes) DESCENDING
WITH directoryPath
,directoryName
,directoryParentPath
,directoryParentName
,directoryPathLength
,collect(author)[0] AS mainAuthor
,collect(author)[1] AS secondAuthor
,collect(author)[2] AS thirdAuthor
,count(DISTINCT author) AS authorCount
,size(apoc.coll.toSet(apoc.coll.flatten(collect(files)))) AS fileCount
,size(apoc.coll.toSet(apoc.coll.flatten(collect(commitHashes)))) AS commitCount
,max(lastCreationDate) AS lastCreationDate
,max(lastModificationDate) AS lastModificationDate
,max(maxCommitSha) AS maxCommitSha
,max(lastCommitDate) AS lastCommitDate
,min(daysSinceLastCommit) AS daysSinceLastCommit
,min(daysSinceLastCreation) AS daysSinceLastCreation
,min(daysSinceLastModification) AS daysSinceLastModification
,max(maxFileRelativePath) AS maxFileRelativePath
// The final results are grouped by the statistic values like file count,...
RETURN collect(directoryPath)[-1] AS directoryPath
,apoc.text.join(collect(directoryName), '/') AS directoryName
,collect(directoryParentPath)[0] AS directoryParentPath
,collect(directoryParentName)[0] AS directoryParentName
,mainAuthor
,secondAuthor
,thirdAuthor
,authorCount
,fileCount
,commitCount
,lastCreationDate
,lastModificationDate
,lastCommitDate
,daysSinceLastCommit
,daysSinceLastCreation
,daysSinceLastModification
,maxCommitSha
,maxFileRelativePath
,max(directoryPathLength) AS directoryPathLength
,count(DISTINCT directoryPath) AS combinedDirectoriesCount
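
The query above derives every ancestor directory of a file by unwinding its path elements: for each element except the file name, it emits one row with the directory path built from the preceding elements. As an illustration only (a hypothetical Python helper, not part of this pull request), the same logic can be sketched like this:

def ancestor_directories(file_path: str) -> list[str]:
    """Mirrors the UNWIND/parent logic of the Cypher query above:
    every path element except the file name yields one directory row."""
    path_elements = file_path.split('/')
    # Builds 'repo', 'repo/src', 'repo/src/main', ... skipping the file name itself.
    return ['/'.join(path_elements[:depth]) for depth in range(1, len(path_elements))]

print(ancestor_directories('repo/src/main/File.java'))
# -> ['repo', 'repo/src', 'repo/src/main']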
@@ -14,6 +14,7 @@ UNWIND git_files AS git_file
RETURN git_repository.name + '/' + git_file.relativePath AS filePath
,split(git_commit.author, ' <')[0] AS author
,count(DISTINCT git_commit.sha) AS commitCount
,collect(DISTINCT git_commit.sha) AS commitHashes
,date(max(git_commit.date)) AS lastCommitDate
,max(date(fileCreatedAtTimestamp)) AS lastCreationDate
,max(date(fileLastModificationAtTimestamp)) AS lastModificationDate
32 changes: 27 additions & 5 deletions jupyter/GitHistoryGeneral.ipynb
@@ -493,9 +493,18 @@
" \"\"\"\n",
" return values.iloc[1] if len(values) > 1 else None\n",
"\n",
"def get_file_count_from_aggregated_file_paths(values: pd.Series):\n",
"def get_flattened_unique_values(values: pd.Series):\n",
" \"\"\"\n",
" Return the file count from an array of array of file paths.\n",
" Return an array of unique string values from an array of array of strings.\n",
" Meant to be used as an aggregation function for dataframe grouping.\n",
" values : Series : The pandas Series of values\n",
" return : Series : The pandas Series of values\n",
" \"\"\"\n",
" return np.unique(np.concatenate(values.to_list()))\n",
"\n",
"def count_unique_aggregated_values(values: pd.Series):\n",
" \"\"\"\n",
" Return the number of unique values from an array of array of strings.\n",
" Meant to be used as an aggregation function for dataframe grouping.\n",
" values : Series : The pandas Series of values\n",
" return : int : The number of files\n",
@@ -573,7 +582,6 @@
"# Define how common non-grouped columns will be aggregated.\n",
"# Hint: maxCommitSha might not seem very useful, but it actually helps by group similar directories in the final step\n",
"common_named_aggregation = dict(\n",
" commitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n",
" daysSinceLastCommit=pd.NamedAgg(column=\"daysSinceLastCommit\", aggfunc=\"min\"),\n",
" daysSinceLastCreation=pd.NamedAgg(column=\"daysSinceLastCreation\", aggfunc=\"min\"),\n",
" daysSinceLastModification=pd.NamedAgg(column=\"daysSinceLastModification\", aggfunc=\"min\"),\n",
@@ -588,12 +596,14 @@
" filePaths=pd.NamedAgg(column=\"filePath\", aggfunc=np.unique),\n",
" firstFile=pd.NamedAgg(column=\"filePath\", aggfunc=\"first\"),\n",
" fileExtensions=pd.NamedAgg(column=\"fileExtension\", aggfunc=collect_as_array),\n",
" commitHashes=pd.NamedAgg(column=\"commitHashes\", aggfunc=get_flattened_unique_values),\n",
" intermediateCommitCount=pd.NamedAgg(column=\"commitHashes\", aggfunc=\"count\"),\n",
" **common_named_aggregation\n",
")\n",
"\n",
"# Sort the grouped and aggregated entries by the name of the directory ascending and the number of commits descending.\n",
"# The author with the most commits will then be listed first for each directory.\n",
"git_files_with_commit_statistics = git_files_with_commit_statistics.sort_values(by=['directoryPath', 'commitCount'], ascending=[True, False])\n",
"git_files_with_commit_statistics = git_files_with_commit_statistics.sort_values(by=['directoryPath', 'intermediateCommitCount'], ascending=[True, False])\n",
"git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()\n",
"\n",
"# Debug\n",
@@ -603,12 +613,13 @@
"# Group the entries again now only by their directory path to get the aggregated number of authors, the main author and the second author.\n",
"# Hint: firstFile (similar to maxCommitSha) might not seem very useful, but it also helps to group similar directories in the final step\n",
"git_files_with_commit_statistics = git_files_with_commit_statistics.groupby('directoryPath').aggregate(\n",
" fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=get_file_count_from_aggregated_file_paths),\n",
" fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=count_unique_aggregated_values),\n",
" firstFile=pd.NamedAgg(column=\"firstFile\", aggfunc=\"first\"),\n",
" mostFrequentFileExtension=pd.NamedAgg(column=\"fileExtensions\", aggfunc=get_most_frequent_entry),\n",
" authorCount=pd.NamedAgg(column=\"author\", aggfunc=\"nunique\"),\n",
" mainAuthor=pd.NamedAgg(column=\"author\", aggfunc=\"first\"),\n",
" secondAuthor=pd.NamedAgg(column=\"author\", aggfunc=second_entry),\n",
" commitCount=pd.NamedAgg(column=\"commitHashes\", aggfunc=count_unique_aggregated_values),\n",
" **common_named_aggregation\n",
")\n",
"git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()\n",
@@ -669,6 +680,17 @@
"git_files_with_commit_statistics.head(30)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "53fcd8b2",
"metadata": {},
"outputs": [],
"source": [
"# Print prepared data frame to CSV file\n",
"# git_files_with_commit_statistics.to_csv('git_files_with_commit_statistics.csv', index=False)"
]
},
{
"cell_type": "markdown",
"id": "ccc11f52",
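
The notebook change replaces the summed commitCount aggregation: summing per-file counts double-counts commits that touch several files in the same directory, so the commit hashes are now collected per file, flattened, de-duplicated, and counted once per directory. A minimal self-contained pandas sketch of that idea, with made-up data (not taken from the repository):

import numpy as np
import pandas as pd

def count_unique_aggregated_values(values: pd.Series) -> int:
    # Flatten the collected per-file hash arrays and count distinct hashes,
    # so commits touching several files are only counted once.
    return len(np.unique(np.concatenate(values.to_list())))

# Commits "c1" and "c2" each touched two files in the same directory.
files = pd.DataFrame({
    'directoryPath': ['src', 'src', 'src'],
    'filePath': ['src/a.py', 'src/b.py', 'src/c.py'],
    'commitHashes': [['c1', 'c2'], ['c1'], ['c2', 'c3']],
})

commit_counts = files.groupby('directoryPath').aggregate(
    commitCount=pd.NamedAgg(column='commitHashes', aggfunc=count_unique_aggregated_values),
)
print(commit_counts)  # commitCount for 'src' is 3, not the naive sum of 5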
6 changes: 3 additions & 3 deletions scripts/activateCondaEnvironment.sh
@@ -37,10 +37,10 @@ echo "activateCondaEnvironment: CONDA_PREFIX=${CONDA_PREFIX}"
echo "activateCondaEnvironment: Current conda environment=${CONDA_DEFAULT_ENV}"
echo "activateCondaEnvironment: Target conda environment=${CODEGRAPH_CONDA_ENVIRONMENT}"

PREPARE_CONDA_ENVIRONMENT=${PREPARE_CONDA_ENVIRONMENT:-"true"} # Wether to prepare then Conda environment if needed (default, "true") or use an already prepared Conda environment ("false")
PREPARE_CONDA_ENVIRONMENT=${PREPARE_CONDA_ENVIRONMENT:-"true"} # Whether to prepare a Python environment with Conda if needed (default, "true") or use an already prepared Conda environment ("false")

if [ "${CONDA_DEFAULT_ENV}" = "${CODEGRAPH_CONDA_ENVIRONMENT}" ] && [ "${PREPARE_CONDA_ENVIRONMENT}" = "false" ]; then
echo "activateCondaEnvironment: Skipping activation. Target conda environment ${CODEGRAPH_CONDA_ENVIRONMENT} is already activated."
if [ "${PREPARE_CONDA_ENVIRONMENT}" = "false" ]; then
echo "activateCondaEnvironment: Skipping activation. ${PREPARE_CONDA_ENVIRONMENT} is set to false."
# "return" needs to be used here instead of "exit".
# This script is included in another script by using "source".
# "exit" would end the main script, "return" just ends this sub script.
56 changes: 56 additions & 0 deletions scripts/reports/GitHistoryCsv.sh
@@ -0,0 +1,56 @@
#!/usr/bin/env bash

# Executes "GitLog" Cypher queries to get the "git-history-csv" CSV reports.
# The reports include lists of files with only one author, last changed or created files, pairwise changed files, and more.

# Requires executeQueryFunctions.sh, cleanupAfterReportGeneration.sh

# Fail on any error ("-e" = exit on first error, "-o pipefail" = exit on errors within piped commands)
set -o errexit -o pipefail

# Overridable Constants (defaults also defined in sub scripts)
REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"}

## Get this "scripts/reports" directory if not already set
# Even though $BASH_SOURCE is made for Bash-like shells, it is also supported by other shells and is therefore the preferred solution here.
# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
# This way non-standard tools like readlink aren't needed.
REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )}
echo "GitHistoryCsv: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}"

# Get the "scripts" directory by taking the path of this script and going one directory up.
SCRIPTS_DIR=${SCRIPTS_DIR:-"${REPORTS_SCRIPT_DIR}/.."} # Repository directory containing the shell scripts
echo "GitHistoryCsv: SCRIPTS_DIR=${SCRIPTS_DIR}"

# Get the "cypher" directory by taking the path of this script and going two directory up and then to "cypher".
CYPHER_DIR=${CYPHER_DIR:-"${REPORTS_SCRIPT_DIR}/../../cypher"}
echo "GitHistoryCsv: CYPHER_DIR=${CYPHER_DIR}"

# Define functions to execute cypher queries from within a given file
source "${SCRIPTS_DIR}/executeQueryFunctions.sh"

# Create report directory
REPORT_NAME="git-history-csv"
FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}"
mkdir -p "${FULL_REPORT_DIRECTORY}"

# Local Constants
GIT_LOG_CYPHER_DIR="${CYPHER_DIR}/GitLog"

echo "GitHistoryCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Processing git history..."

# Detailed git file statistics
execute_cypher "${GIT_LOG_CYPHER_DIR}/List_git_files_with_commit_statistics_by_author.cypher" > "${FULL_REPORT_DIRECTORY}/List_git_files_with_commit_statistics_by_author.csv"
execute_cypher "${GIT_LOG_CYPHER_DIR}/List_git_files_that_were_changed_together_with_another_file.cypher" > "${FULL_REPORT_DIRECTORY}/List_git_files_that_were_changed_together_with_another_file.csv"
execute_cypher "${GIT_LOG_CYPHER_DIR}/List_git_file_directories_with_commit_statistics.cypher" > "${FULL_REPORT_DIRECTORY}/List_git_file_directories_with_commit_statistics.csv"

# Overall distribution of how many files were changed with one git commit, how many were changed with two, etc.
execute_cypher "${GIT_LOG_CYPHER_DIR}/List_git_files_per_commit_distribution.cypher" > "${FULL_REPORT_DIRECTORY}/List_git_files_per_commit_distribution.csv"

# Data basis for finding out if there is a correlation between pairwise changed files and their dependencies
execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files_with_dependencies.cypher" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files_with_dependencies.csv"

# Clean-up after report generation. Empty reports will be deleted.
source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${FULL_REPORT_DIRECTORY}"

echo "GitHistoryCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished."