Add git history file overview treemap

JohT · JohT · commit 353de08dce07 · 2025-02-10T21:07:08.000+01:00
diff --git a/cypher/GitLog/List_git_files_directories.cypher b/cypher/GitLog/List_git_files_directories.cypher
@@ -0,0 +1,17 @@
+// List git file directories and the number of files they contain
+
+ MATCH (git_file:File&Git&!Repository)
+  WITH git_file.relativePath                   AS gitFileName
+      ,last(split(git_file.relativePath, '/')) AS gitFileNameWithoutPath
+// Cut the file name off the end by its length. Splitting by the name would break paths that contain it twice.
+  WITH *
+      ,rtrim(left(gitFileName, size(gitFileName) - size(gitFileNameWithoutPath)), '/') AS gitFilePath
+// The non-aggregated columns (all derived from gitFilePath) group the file count by directory.
+ RETURN gitFilePath
+       ,last(split(gitFilePath, '/')) AS lastPathElement
+       ,size(split(gitFilePath, '/')) AS pathLength
+       ,count(DISTINCT gitFileName)   AS numberOfContainedFiles
+// Debugging
+//       ,gitFileName
+//       ,gitFileNameWithoutPath
+ ORDER BY gitFilePath ASC
diff --git a/jupyter/GitHistoryJava.ipynb b/jupyter/GitHistoryJava.ipynb
@@ -0,0 +1,225 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "2f0eabc4",
+   "metadata": {},
+   "source": [
+    "# Git History\n",
+    "<br>  \n",
+    "\n",
+    "### References\n",
+    "- [Analyze java package metrics in a graph database](https://joht.github.io/johtizen/data/2023/04/21/java-package-metrics-analysis.html)\n",
+    "- [Calculate metrics](https://101.jqassistant.org/calculate-metrics/index.html)\n",
+    "- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "4191f259",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import pandas as pd\n",
+    "from neo4j import GraphDatabase"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "c57aadf9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plot\n",
+    "import squarify"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "1c5dab37",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set the environment variable \"NEO4J_INITIAL_PASSWORD\" in your shell before starting jupyter notebook\n",
+    "# to provide the password for the user \"neo4j\" (os.environ.get passes None when it is not set).\n",
+    "# It is not recommended to hardcode the password into the notebook for security reasons.\n",
+    "\n",
+    "driver = GraphDatabase.driver(uri=\"bolt://localhost:7687\", auth=(\"neo4j\", os.environ.get(\"NEO4J_INITIAL_PASSWORD\")))\n",
+    "driver.verify_connectivity()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "c1db254b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_cypher_query_from_file(cypherFileName):\n",
+    "    with open(cypherFileName) as cypher_file:  # the file gets closed when the block is left\n",
+    "        return ' '.join(cypher_file)  # lines keep their line breaks and are joined by one space"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "59310f6f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def query_cypher_to_data_frame(filename : str, limit: int = 10_000):\n",
+    "    \"\"\"Runs the Cypher query of the given file with an appended LIMIT and returns the records as DataFrame.\"\"\"\n",
+    "    limited_query = \"{query}\\nLIMIT {row_limit}\".format(query = get_cypher_query_from_file(filename), row_limit = limit)\n",
+    "    records, _, keys = driver.execute_query(limited_query)\n",
+    "    rows = [record.values() for record in records]\n",
+    "    return pd.DataFrame(rows, columns=keys)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "c09da482",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = 10_000):\n",
+    "    \"\"\"\n",
+    "    Runs the Cypher query of every given file in order and returns the first result that is not empty.\n",
+    "    If all of them are empty, the (empty) result of the last file is returned (empty DataFrame without files).\n",
+    "    The optional \"limit=\" will be appended as \"LIMIT\" to each query so that only the first results get returned.\n",
+    "    \"\"\"\n",
+    "    latest_result = pd.DataFrame()\n",
+    "    for cypher_file_name in filenames:\n",
+    "        latest_result = query_cypher_to_data_frame(cypher_file_name, limit)\n",
+    "        if not latest_result.empty:\n",
+    "            break\n",
+    "    return latest_result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "a56670c9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#The following cell uses the built-in %html \"magic\" to override the CSS style for tables to a much smaller size.\n",
+    "#This is especially needed for PDF export of tables with multiple columns."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "006b9dc8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%html\n",
+    "<style>\n",
+    "/* CSS style for smaller dataframe tables. */\n",
+    ".dataframe th {\n",
+    "    font-size: 8px;\n",
+    "}\n",
+    ".dataframe td {\n",
+    "    font-size: 8px;\n",
+    "}\n",
+    "</style>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "6323e85e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pandas DataFrame Display Configuration\n",
+    "pd.set_option('display.max_colwidth', 300)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "91d80bf7",
+   "metadata": {},
+   "source": [
+    "## Git History\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "id": "dc682db6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "git_file_directories = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_directories.cypher\", limit=50)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0d5d23ab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "git_file_directories"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f2a7c8ea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Treemap with one rectangle per directory, sized by the number of files it contains\n",
+    "figure, axis = plot.subplots(figsize=(20, 20))\n",
+    "axis.set_title('Directories with the number of contained files')\n",
+    "axis.set_axis_off()\n",
+    "squarify.plot(\n",
+    "    sizes=git_file_directories['numberOfContainedFiles'],\n",
+    "    label=git_file_directories['lastPathElement'],\n",
+    "    text_kwargs=dict(color='white', fontsize=9, fontweight='bold'),\n",
+    "    edgecolor=\"white\", linewidth=4,  # white borders around the rectangles\n",
+    "    ax=axis\n",
+    ")\n",
+    "plot.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "authors": [
+   {
+    "name": "JohT"
+   }
+  ],
+  "code_graph_analysis_pipeline_data_validation": "ValidateJavaInternalDependencies",
+  "kernelspec": {
+   "display_name": "codegraph",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  },
+  "title": "Git History Charts for Java with Neo4j"
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/jupyter/environment.yml b/jupyter/environment.yml
@@ -15,5 +15,6 @@ dependencies:
   - opentsne=1.0.* # to visualize node embeddings in 2D (t-SNE dimensionality reduction)
   - wordcloud=1.9.*
   - monotonic=1.*
+  - squarify=0.4.* # to create tree maps e.g. for git history
   - pip:
       - neo4j==5.23.*