diff --git a/.github/workflows/quick-checks.yml b/.github/workflows/quick-checks.yml new file mode 100644 index 0000000..c00aff9 --- /dev/null +++ b/.github/workflows/quick-checks.yml @@ -0,0 +1,51 @@ +--- +name: Quick Checks +env: + KHIOPS_PYTHON_REVISION: 171-implement-multi-table-helper-functions +on: + pull_request: + workflow_dispatch: +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - name: Checkout sources + uses: actions/checkout@v4 + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + - name: Install khiops-python dev dependencies + run: | + pip install pre-commit + - name: Run pre-commit checks + uses: pre-commit/action@v3.0.1 + + execute-nb: + runs-on: ubuntu-22.04 + container: + image: ghcr.io/khiopsml/khiops-python/khiopspydev-ubuntu22.04:10.2.3-b.5.0 + steps: + - name: Checkout sources + uses: actions/checkout@v4 + - name: Install pre-requisites + run: | + pip install nbconvert nbformat jupyter + - name: Install khiops-python + run: | + git clone https://github.com/khiopsml/khiops-python + cd khiops-python + git switch $KHIOPS_PYTHON_REVISION + pip install . 
+ kh-status + kh-download-datasets + - name: Execute the convert hook + run: | + mkdir output_nb + export KHIOPS_PROC_NUMBER=1 + python khiops-python/doc/convert_tutorials.py --execute-notebooks ./ output_nb + ls -ltr output_nb diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 183bef9..a467bea 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,9 +3,7 @@ repos: rev: 22.10.0 hooks: - id: black - language_version: python3.9 - id: black-jupyter - language_version: python3.9 - repo: local hooks: - id: my-nb-clean diff --git a/Core Basics 1 - Train, Evaluate and Deploy a Classifier.ipynb b/Core Basics 1 - Train, Evaluate and Deploy a Classifier.ipynb index 938cee3..8d73c85 100644 --- a/Core Basics 1 - Train, Evaluate and Deploy a Classifier.ipynb +++ b/Core Basics 1 - Train, Evaluate and Deploy a Classifier.ipynb @@ -7,7 +7,9 @@ "# Core Basics 1: Train, Evaluate and Deploy a Classifier\n", "In this lesson we will learn how to train, evaluate and deploy classifiers with Khiops.\n", "\n", - "We start by importing Khiops, some helper functions and saving the location of the Khiops `Samples` directory into a variable" + "Make sure you have installed [Khiops](https://khiops.org/setup/) and [Khiops Visualization](https://khiops.org/setup/visualization/).\n", + "\n", + "We start by importing Khiops and defining some helper functions:" ] }, { @@ -16,13 +18,32 @@ "metadata": {}, "outputs": [], "source": [ - "from os import path\n", - "\n", + "import os\n", + "import platform\n", + "import subprocess\n", "from khiops import core as kh\n", - "from helper_functions import explorer_open, peek\n", "\n", - "samples_dir = kh.get_samples_dir()\n", - "print(f\"Khiops samples directory located at {samples_dir}\")" + "# Define helper functions\n", + "def peek(file_path, n=10):\n", + " \"\"\"Shows the first n lines of a file\"\"\"\n", + " with open(file_path, encoding=\"utf8\", errors=\"replace\") as file:\n", + " for line in file.readlines()[:n]:\n", 
+ "        print(line, end=\"\")\n", + "    print(\"\")\n", + "\n", + "\n", + "def os_open(path):\n", + "    \"\"\"Opens a file or directory with its default application\"\"\"\n", + "    if platform.system() == \"Windows\":\n", + "        os.startfile(path)\n", + "    elif platform.system() == \"Darwin\":\n", + "        subprocess.call([\"open\", path])\n", + "    else:\n", + "        subprocess.call([\"xdg-open\", path])\n", + "\n", + "\n", + "# If there are any issues you may check the Khiops status with the following command\n", + "# kh.get_runner().print_status()" ] }, { @@ -46,18 +67,12 @@ "metadata": {}, "outputs": [], "source": [ - "iris_kdic = path.join(samples_dir, \"Iris\", \"Iris.kdic\")\n", - "iris_data_file = path.join(samples_dir, \"Iris\", \"Iris.txt\")\n", + "iris_kdic = os.path.join(kh.get_samples_dir(), \"Iris\", \"Iris.kdic\")\n", + "iris_data_file = os.path.join(kh.get_samples_dir(), \"Iris\", \"Iris.txt\")\n", "\n", - "print(\"\")\n", - "print(f\"Iris dictionary file location: {iris_kdic}\")\n", - "print(\"\")\n", + "print(f\"Iris dictionary file: {iris_kdic}\")\n", "peek(iris_kdic)\n", - "\n", - "print(\"\")\n", - "print(\"\")\n", - "print(f\"Iris data location: {iris_data_file}\")\n", - "print(\"\")\n", + "print(f\"Iris data file: {iris_data_file}\\n\")\n", "peek(iris_data_file)" ] }, @@ -74,7 +89,7 @@ "metadata": {}, "outputs": [], "source": [ - "iris_results_dir = path.join(\"exercises\", \"Iris\")\n", + "iris_results_dir = os.path.join(\"exercises\", \"Iris\")\n", "print(f\"Iris results directory: {iris_results_dir}\")" ] }, @@ -101,15 +116,15 @@ " results_dir=iris_results_dir,\n", " max_trees=0, # by default Khiops constructs 10 decision tree variables\n", ")\n", - "print(f\"Iris report file located at: {iris_report}\")\n", - "print(f\"Iris modeling dictionary file located at: {iris_model_kdic}\")" + "print(f\"Iris report file: {iris_report}\")\n", + "print(f\"Iris modeling dictionary: {iris_model_kdic}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "You can verify that 
the result files were created in `iris_results_dir`. In the next sections, we'll use the file at `iris_report` to assess the models' performances and the file at `iris_model_kdic` to deploy it." + "You can verify that the result files were created in `iris_results_dir`. In the next sections, we'll use the file at `iris_report` to assess the models' performances and the file at `iris_model_kdic` to deploy it. Now we can see the report with the Khiops Visualization app:" ] }, { @@ -118,8 +133,8 @@ "metadata": {}, "outputs": [], "source": [ - "# To take a look at the directory where the resulting files are stored\n", - "# explorer_open(iris_results_dir)" + "# To visualize uncomment the line below\n", + "# os_open(iris_report)" ] }, { @@ -139,8 +154,8 @@ "metadata": {}, "outputs": [], "source": [ - "adult_kdic = path.join(samples_dir, \"Adult\", \"Adult.kdic\")\n", - "adult_data_file = path.join(samples_dir, \"Adult\", \"Adult.txt\")" + "adult_kdic = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", + "adult_data_file = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")" ] }, { @@ -158,15 +173,9 @@ }, "outputs": [], "source": [ - "print(\"\")\n", - "print(f\"Adult dictionary file location: {adult_kdic}\")\n", - "print(\"\")\n", + "print(f\"Adult dictionary file: {adult_kdic}\")\n", "peek(adult_kdic)\n", - "\n", - "print(\"\")\n", - "print(\"\")\n", - "print(f\"Adult data location: {adult_data_file}\")\n", - "print(\"\")\n", + "print(f\"Adult data file: {adult_data_file}\\n\")\n", "peek(adult_data_file)" ] }, @@ -183,8 +192,8 @@ "metadata": {}, "outputs": [], "source": [ - "adult_results_dir = path.join(\"exercises\", \"Adult\")\n", - "print(f\"Adult exercise results directory: {adult_results_dir}\")" + "adult_results_dir = os.path.join(\"exercises\", \"Adult\")\n", + "print(f\"Adult results directory: {adult_results_dir}\")" ] }, { @@ -211,8 +220,27 @@ " results_dir=adult_results_dir,\n", " max_trees=0,\n", ")\n", - "print(f\"Adult report file 
located at: {adult_report}\")\n", - "print(f\"Adult modeling dictionary file located at: {adult_model_kdic}\")" + "print(f\"Adult report file: {adult_report}\")\n", + "print(f\"Adult modeling dictionary file: {adult_model_kdic}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Inspect the results with the Khiops Visualization app" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "is_khiops_tutorial_solution": true + }, + "outputs": [], + "source": [ + "# To visualize uncomment the line below\n", + "# os_open(adult_report)" ] }, { @@ -388,7 +416,7 @@ "metadata": {}, "outputs": [], "source": [ - "iris_deployment_file = path.join(iris_results_dir, \"iris_deployment.txt\")\n", + "iris_deployment_file = os.path.join(iris_results_dir, \"iris_deployment.txt\")\n", "kh.deploy_model(\n", " iris_model_kdic,\n", " dictionary_name=\"SNB_Iris\",\n", @@ -416,7 +444,7 @@ }, "outputs": [], "source": [ - "adult_deployment_file = path.join(adult_results_dir, \"adult_deployment.txt\")\n", + "adult_deployment_file = os.path.join(adult_results_dir, \"adult_deployment.txt\")\n", "kh.deploy_model(\n", " adult_model_kdic,\n", " dictionary_name=\"SNB_Adult\",\n", @@ -447,5 +475,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/Core Basics 2 - Train a Classifier on a Star Multi-Table Dataset.ipynb b/Core Basics 2 - Train a Classifier on a Star Multi-Table Dataset.ipynb index 052e644..739c731 100644 --- a/Core Basics 2 - Train a Classifier on a Star Multi-Table Dataset.ipynb +++ b/Core Basics 2 - Train a Classifier on a Star Multi-Table Dataset.ipynb @@ -8,7 +8,9 @@ "\n", "In this notebook we learn how to train a classifier with a multi-table data composed of two tables (a root table and a secondary table). 
It is highly recommended to see the _Core Basics 1_ lesson if you are not familiar with Khiops.\n", "\n", - "We start by importing Khiops and some helper functions:" + "Make sure you have installed [Khiops](https://khiops.org/setup/) and [Khiops Visualization](https://khiops.org/setup/visualization/).\n", + "\n", + "We start by importing Khiops, checking its installation and defining some helper functions:" ] }, { @@ -17,10 +19,32 @@ "metadata": {}, "outputs": [], "source": [ - "from os import path\n", - "\n", + "import os\n", + "import platform\n", + "import subprocess\n", "from khiops import core as kh\n", - "from helper_functions import explorer_open, peek" + "\n", + "# Define helper functions\n", + "def peek(file_path, n=10):\n", + "    \"\"\"Shows the first n lines of a file\"\"\"\n", + "    with open(file_path, encoding=\"utf8\", errors=\"replace\") as file:\n", + "        for line in file.readlines()[:n]:\n", + "            print(line, end=\"\")\n", + "    print(\"\")\n", + "\n", + "\n", + "def os_open(path):\n", + "    \"\"\"Opens a file or directory with its default application\"\"\"\n", + "    if platform.system() == \"Windows\":\n", + "        os.startfile(path)\n", + "    elif platform.system() == \"Darwin\":\n", + "        subprocess.call([\"open\", path])\n", + "    else:\n", + "        subprocess.call([\"xdg-open\", path])\n", + "\n", + "\n", + "# If there are any issues you may check the Khiops status with the following command\n", + "# kh.get_runner().print_status()" ] }, { @@ -75,10 +99,9 @@ "metadata": {}, "outputs": [], "source": [ - "sarcasm_kdic = path.join(\"data\", \"HeadlineSarcasm\", \"HeadlineSarcasm.kdic\")\n", + "sarcasm_kdic = os.path.join(\"data\", \"HeadlineSarcasm\", \"HeadlineSarcasm.kdic\")\n", "\n", - "print(\"\")\n", - "print(f\"HeadlineSarcasm dictionary file location: {sarcasm_kdic}\")\n", + "print(f\"HeadlineSarcasm dictionary file: {sarcasm_kdic}\")\n", "print(\"\")\n", "peek(sarcasm_kdic, n=15)" ] @@ -101,15 +124,13 @@ "metadata": {}, "outputs": [], "source": [ - "sarcasm_headlines_file = 
path.join(\"data\", \"HeadlineSarcasm\", \"Headlines.txt\")\n", - "sarcasm_words_file = path.join(\"data\", \"HeadlineSarcasm\", \"HeadlineWords.txt\")\n", + "sarcasm_headlines_file = os.path.join(\"data\", \"HeadlineSarcasm\", \"Headlines.txt\")\n", + "sarcasm_words_file = os.path.join(\"data\", \"HeadlineSarcasm\", \"HeadlineWords.txt\")\n", "\n", - "print(\"\")\n", - "print(f\"HeadlineSarcasm main table file location: {sarcasm_headlines_file}\")\n", + "print(f\"HeadlineSarcasm main table file: {sarcasm_headlines_file}\")\n", "print(\"\")\n", "peek(sarcasm_headlines_file, n=3)\n", "\n", - "print(\"\")\n", "print(f\"HeadlineSarcasm secondary table file location: {sarcasm_words_file}\")\n", "print(\"\")\n", "peek(sarcasm_words_file, n=15)" @@ -144,7 +165,7 @@ "metadata": {}, "outputs": [], "source": [ - "sarcasm_results_dir = path.join(\"exercises\", \"HeadlineSarcasm\")\n", + "sarcasm_results_dir = os.path.join(\"exercises\", \"HeadlineSarcasm\")\n", "\n", "sarcasm_report, sarcasm_model_kdic = kh.train_predictor(\n", " sarcasm_kdic,\n", @@ -173,7 +194,8 @@ "metadata": {}, "outputs": [], "source": [ - "# explorer_open(sarcasm_report)" + "# To visualize uncomment the line below\n", + "# os_open(sarcasm_report)" ] }, { @@ -193,7 +215,8 @@ "```\n", "+---------------+\n", "|Accidents |\n", - "+---------------+n|AccidentId* |\n", + "+---------------+\n", + "|AccidentId* |\n", "|Gravity |\n", "|Date |\n", "|Hour | +---------------+\n", @@ -218,11 +241,15 @@ "metadata": {}, "outputs": [], "source": [ - "accidents_kdic = path.join(kh.get_samples_dir(), \"AccidentsSummary\", \"Accidents.kdic\")\n", - "accidents_data_file = path.join(\n", + "accidents_kdic = os.path.join(\n", + " kh.get_samples_dir(), \"AccidentsSummary\", \"Accidents.kdic\"\n", + ")\n", + "accidents_data_file = os.path.join(\n", " kh.get_samples_dir(), \"AccidentsSummary\", \"Accidents.txt\"\n", ")\n", - "vehicles_data_file = path.join(kh.get_samples_dir(), \"AccidentsSummary\", \"Vehicles.txt\")" + 
"vehicles_data_file = os.path.join(\n", + " kh.get_samples_dir(), \"AccidentsSummary\", \"Vehicles.txt\"\n", + ")" ] }, { @@ -241,18 +268,15 @@ }, "outputs": [], "source": [ - "print(\"\")\n", - "print(f\"Accidents dictionary file location: {accidents_kdic}\")\n", + "print(f\"Accidents dictionary file: {accidents_kdic}\")\n", "print(\"\")\n", "peek(accidents_kdic, n=40)\n", "\n", - "print(\"\")\n", - "print(f\"Accidents data table location: {accidents_data_file}\")\n", + "print(f\"Accidents (main) data table: {accidents_data_file}\")\n", "print(\"\")\n", "peek(accidents_data_file)\n", "\n", - "print(\"\")\n", - "print(f\"Vehicles main data table location: {vehicles_data_file}\")\n", + "print(f\"Vehicles data table: {vehicles_data_file}\")\n", "print(\"\")\n", "peek(vehicles_data_file)" ] @@ -270,7 +294,7 @@ "metadata": {}, "outputs": [], "source": [ - "accidents_results_dir = path.join(\"exercises\", \"AccidentSummary\")\n", + "accidents_results_dir = os.path.join(\"exercises\", \"AccidentSummary\")\n", "print(f\"AccidentsSummary exercise results directory: {accidents_results_dir}\")" ] }, @@ -278,7 +302,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Train a classifier for the `AccidentsEn` database with 1000 variables\n", + "#### Train a classifier for the `Accidents` database with 1000 variables\n", "Save the resulting file locations into the variables `accidents_report` and `accidents_model_kdic` and print them.\n", "\n", "Do not forget:\n", @@ -305,8 +329,8 @@ " max_constructed_variables=1000,\n", " max_trees=0,\n", ")\n", - "print(f\"AccidentsSummary report file located at: {accidents_report}\")\n", - "print(f\"AccidentsSummary modeling dictionary file located at: {accidents_model_kdic}\")" + "print(f\"AccidentsSummary report file: {accidents_report}\")\n", + "print(f\"AccidentsSummary modeling dictionary: {accidents_model_kdic}\")" ] }, { @@ -320,10 +344,13 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { 
+ "is_khiops_tutorial_solution": true + }, "outputs": [], "source": [ - "# explorer_open(accidents_report)" + "# To visualize uncomment the line below\n", + "# os_open(accidents_report)" ] } ], @@ -348,4 +375,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/Core Basics 3 - Train a Classifier on a Snowflake Multi-Table Dataset.ipynb b/Core Basics 3 - Train a Classifier on a Snowflake Multi-Table Dataset.ipynb index c7d40c3..7066da6 100644 --- a/Core Basics 3 - Train a Classifier on a Snowflake Multi-Table Dataset.ipynb +++ b/Core Basics 3 - Train a Classifier on a Snowflake Multi-Table Dataset.ipynb @@ -8,7 +8,9 @@ "\n", "In this notebook, we learn how to train a classifier with a more complex multi-table data where a secondary table is itself a parent table of another table (ie. snowflake schema). It is highly recommended to see the _Basics 1_ and _Basics 2_ lessons if you are not familiar with Khiops.\n", "\n", - "We start by importing `khiops` and some helper functions:" + "Make sure you have installed [Khiops](https://khiops.org/setup/) and [Khiops Visualization](https://khiops.org/setup/visualization/).\n", + "\n", + "We start by importing Khiops, checking its installation and defining some helper functions:" ] }, { @@ -17,10 +19,32 @@ "metadata": {}, "outputs": [], "source": [ - "from os import path\n", - "\n", + "import os\n", + "import platform\n", + "import subprocess\n", "from khiops import core as kh\n", - "from helper_functions import explorer_open, peek" + "\n", + "# Define helper functions\n", + "def peek(file_path, n=10):\n", + " \"\"\"Shows the first n lines of a file\"\"\"\n", + " with open(file_path, encoding=\"utf8\", errors=\"replace\") as file:\n", + " for line in file.readlines()[:n]:\n", + " print(line, end=\"\")\n", + " print(\"\")\n", + "\n", + "\n", + "def os_open(path):\n", + " \"\"\"Opens a file or directory with its default application\"\"\"\n", + " if platform.system() == \"Windows\":\n", + " 
os.startfile(path)\n", + "    elif platform.system() == \"Darwin\":\n", + "        subprocess.call([\"open\", path])\n", + "    else:\n", + "        subprocess.call([\"xdg-open\", path])\n", + "\n", + "\n", + "# If there are any issues you may check the Khiops status with the following command\n", + "# kh.get_runner().print_status()" ] }, { @@ -52,13 +76,12 @@ "metadata": {}, "outputs": [], "source": [ - "accidents_dataset_dir = path.join(kh.get_samples_dir(), \"Accidents\")\n", - "accidents_kdic = path.join(accidents_dataset_dir, \"Accidents.kdic\")\n", + "accidents_dataset_dir = os.path.join(kh.get_samples_dir(), \"Accidents\")\n", + "accidents_kdic = os.path.join(accidents_dataset_dir, \"Accidents.kdic\")\n", "\n", - "print(\"\")\n", "print(f\"Accidents dictionary file location: {accidents_kdic}\")\n", "print(\"\")\n", - "peek(accidents_kdic, n=40)" + "peek(accidents_kdic, n=45)" ] }, { @@ -79,27 +102,23 @@ "metadata": {}, "outputs": [], "source": [ - "print(\"\")\n", - "accidents_data_file = path.join(accidents_dataset_dir, \"Accidents.txt\")\n", - "print(f\"Accidents data table location: {accidents_data_file}\")\n", + "accidents_data_file = os.path.join(accidents_dataset_dir, \"Accidents.txt\")\n", + "print(f\"Accidents data table: {accidents_data_file}\")\n", "print(\"\")\n", "peek(accidents_data_file)\n", "\n", - "print(\"\")\n", - "vehicles_data_file = path.join(accidents_dataset_dir, \"Vehicles.txt\")\n", - "print(f\"Vehicles data table location: {vehicles_data_file}\")\n", + "vehicles_data_file = os.path.join(accidents_dataset_dir, \"Vehicles.txt\")\n", + "print(f\"Vehicles data table: {vehicles_data_file}\")\n", "print(\"\")\n", "peek(vehicles_data_file)\n", "\n", - "print(\"\")\n", - "places_data_file = path.join(accidents_dataset_dir, \"Places.txt\")\n", - "print(f\"Places data table location: {places_data_file}\")\n", + "places_data_file = os.path.join(accidents_dataset_dir, \"Places.txt\")\n", + "print(f\"Places data table: {places_data_file}\")\n", "print(\"\")\n", 
"peek(places_data_file)\n", "\n", - "print(\"\")\n", - "users_data_file = path.join(accidents_dataset_dir, \"Users.txt\")\n", - "print(f\"Users data table location: {users_data_file}\")\n", + "users_data_file = os.path.join(accidents_dataset_dir, \"Users.txt\")\n", + "print(f\"Users data table: {users_data_file}\")\n", "print(\"\")\n", "peek(users_data_file)" ] @@ -131,7 +150,7 @@ "metadata": {}, "outputs": [], "source": [ - "accidents_results_dir = path.join(\"exercises\", \"Accidents\")\n", + "accidents_results_dir = os.path.join(\"exercises\", \"Accidents\")\n", "accidents_report, accidents_model_kdic = kh.train_predictor(\n", " accidents_kdic,\n", " dictionary_name=\"Accident\",\n", @@ -146,8 +165,8 @@ " max_constructed_variables=1000,\n", " max_trees=0,\n", ")\n", - "print(f\"Accidents report file located at: {accidents_report}\")\n", - "print(f\"Accidents modeling dictionary file located at: {accidents_model_kdic}\")" + "print(f\"Accidents report file: {accidents_report}\")\n", + "print(f\"Accidents modeling dictionary file: {accidents_model_kdic}\")" ] }, { @@ -164,7 +183,8 @@ "metadata": {}, "outputs": [], "source": [ - "# explorer_open(accidents_report)" + "# To visualize uncomment the line below\n", + "# os_open(accidents_report)" ] } ], @@ -189,4 +209,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/Core Basics 4 - Train a Coclustering.ipynb b/Core Basics 4 - Train a Coclustering.ipynb index ccf13ec..7c18627 100644 --- a/Core Basics 4 - Train a Coclustering.ipynb +++ b/Core Basics 4 - Train a Coclustering.ipynb @@ -8,7 +8,9 @@ "# Core Basics 4: Train a Coclustering\n", "The steps to train a coclustering model with Khiops are very similar to what we have already seen in the basic classifier tutorials.\n", "\n", - "We now execute the tutorial setup:" + "Make sure you have installed [Khiops](https://khiops.org/setup/) and [Khiops CoVisualization](https://khiops.org/setup/visualization/).\n", + "\n", + "We start by 
importing Khiops, checking its installation and defining some helper functions:" ] }, { @@ -17,9 +19,32 @@ "metadata": {}, "outputs": [], "source": [ - "from os import path\n", + "import os\n", + "import platform\n", + "import subprocess\n", "from khiops import core as kh\n", - "from helper_functions import explorer_open, peek" + "\n", + "# Define helper functions\n", + "def peek(file_path, n=10):\n", + "    \"\"\"Shows the first n lines of a file\"\"\"\n", + "    with open(file_path, encoding=\"utf8\", errors=\"replace\") as file:\n", + "        for line in file.readlines()[:n]:\n", + "            print(line, end=\"\")\n", + "    print(\"\")\n", + "\n", + "\n", + "def os_open(path):\n", + "    \"\"\"Opens a file or directory with its default application\"\"\"\n", + "    if platform.system() == \"Windows\":\n", + "        os.startfile(path)\n", + "    elif platform.system() == \"Darwin\":\n", + "        subprocess.call([\"open\", path])\n", + "    else:\n", + "        subprocess.call([\"xdg-open\", path])\n", + "\n", + "\n", + "# If there are any issues you may check the Khiops status with the following command\n", + "# kh.get_runner().print_status()" ] }, { @@ -35,19 +60,17 @@ "metadata": {}, "outputs": [], "source": [ - "countries_kdic = path.join(\n", + "countries_kdic = os.path.join(\n", " \"data\", \"CountriesByOrganization\", \"CountriesByOrganization.kdic\"\n", ")\n", - "countries_data_file = path.join(\n", + "countries_data_file = os.path.join(\n", " \"data\", \"CountriesByOrganization\", \"CountriesByOrganization.csv\"\n", ")\n", "\n", - "print(\"\")\n", "print(f\"CountriesByOrganization dictionary file location: {countries_kdic}\")\n", "print(\"\")\n", - "peek(countries_kdic, n=15)\n", + "peek(countries_kdic)\n", "\n", - "print(\"\")\n", "print(f\"CountriesByOrganization data table file location: {countries_data_file}\")\n", "print(\"\")\n", "peek(countries_data_file)" ] @@ -66,7 +89,7 @@ "metadata": {}, "outputs": [], "source": [ - "countries_results_dir = path.join(\"exercises\", \"CountriesByOrganization\")\n", + 
"countries_results_dir = os.path.join(\"exercises\", \"CountriesByOrganization\")\n", "\n", "countries_cc_report = kh.train_coclustering(\n", " countries_kdic,\n", @@ -91,7 +114,8 @@ "metadata": {}, "outputs": [], "source": [ - "# explorer_open(path.dirname(countries_cc_report))" + "# To visualize uncomment the line below\n", + "# os_open(countries_cc_report)" ] }, { @@ -107,7 +131,7 @@ "metadata": {}, "outputs": [], "source": [ - "country_clusters_file = path.join(\n", + "country_clusters_file = os.path.join(\n", " \"exercises\", \"CountriesByOrganization\", \"CountryClusters.txt\"\n", ")\n", "kh.extract_clusters(\n", @@ -139,9 +163,9 @@ "metadata": {}, "outputs": [], "source": [ - "tokyo_kdic = path.join(\"data\", \"Tokyo2021\", \"Athletes.kdic\")\n", - "tokyo_data_file = path.join(\"data\", \"Tokyo2021\", \"Athletes.csv\")\n", - "tokyo_results_dir = path.join(\"exercises\", \"Tokyo2021\")" + "tokyo_kdic = os.path.join(\"data\", \"Tokyo2021\", \"Athletes.kdic\")\n", + "tokyo_data_file = os.path.join(\"data\", \"Tokyo2021\", \"Athletes.csv\")\n", + "tokyo_results_dir = os.path.join(\"exercises\", \"Tokyo2021\")" ] }, { @@ -157,13 +181,11 @@ "metadata": {}, "outputs": [], "source": [ - "print(\"\")\n", - "print(f\"Tokyo2021 dictionary file location: {tokyo_kdic}\")\n", + "print(f\"Tokyo2021 dictionary file: {tokyo_kdic}\")\n", "print(\"\")\n", "peek(tokyo_kdic, n=15)\n", "\n", - "print(\"\")\n", - "print(f\"Tokyo data table file location: {tokyo_data_file}\")\n", + "print(f\"Tokyo data table file: {tokyo_data_file}\")\n", "print(\"\")\n", "peek(tokyo_data_file)" ] @@ -207,7 +229,8 @@ "metadata": {}, "outputs": [], "source": [ - "# explorer_open(path.dirname(tokyo_coclustering_report))" + "# To visualize uncomment the line below\n", + "# os_open(tokyo_cc_report)" ] }, { @@ -225,7 +248,9 @@ }, "outputs": [], "source": [ - "tokyo_country_clusters_file = path.join(\"exercises\", \"Tokyo2021\", \"CountryClusters.txt\")\n", + "tokyo_country_clusters_file = 
os.path.join(\n", + " \"exercises\", \"Tokyo2021\", \"CountryClusters.txt\"\n", + ")\n", "\n", "kh.extract_clusters(\n", " tokyo_cc_report,\n", @@ -250,7 +275,7 @@ }, "outputs": [], "source": [ - "tokyo_discipline_clusters_file = path.join(\n", + "tokyo_discipline_clusters_file = os.path.join(\n", " \"exercises\", \"Tokyo2021\", \"CountryClusters.txt\"\n", ")\n", "\n", @@ -259,7 +284,7 @@ " cluster_variable=\"Discipline\",\n", " clusters_file_path=tokyo_discipline_clusters_file,\n", ")\n", - "peek(tokyo_discipline_clusters_file, n=200, l=100)" + "peek(tokyo_discipline_clusters_file, n=200)" ] } ], @@ -283,5 +308,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/Sklearn Basics 1 - Train, Evaluate and Deploy a Classifier.ipynb b/Sklearn Basics 1 - Train, Evaluate and Deploy a Classifier.ipynb index 9dd200b..61f107c 100644 --- a/Sklearn Basics 1 - Train, Evaluate and Deploy a Classifier.ipynb +++ b/Sklearn Basics 1 - Train, Evaluate and Deploy a Classifier.ipynb @@ -7,7 +7,7 @@ "# Sklearn Basics 1: Train, Evaluate and Deploy a Classifier\n", "In this lesson, we will learn how to train, evaluate and deploy a classifier with Khiops sklearn.\n", "\n", - "We start by importing Khiops sklearn classifier `KhiopsClassifier` and saving the location of the Khiops `Samples` directory into a variable:" + "We start by importing the sklearn estimator `KhiopsClassifier`:" ] }, { @@ -16,14 +16,13 @@ "metadata": {}, "outputs": [], "source": [ - "from os import path\n", + "import os\n", "import pandas as pd\n", - "\n", "from khiops import core as kh\n", "from khiops.sklearn import KhiopsClassifier\n", "\n", - "samples_dir = kh.get_samples_dir()\n", - "print(f\"Khiops samples directory located at {samples_dir}\")" + "# If there are any issues you may check the Khiops status with the following command\n", + "# kh.get_runner().print_status()" ] }, { @@ -45,10 +44,10 @@ "metadata": {}, "outputs": [], "source": [ - "iris_data_file = 
path.join(samples_dir, \"Iris\", \"Iris.txt\")\n", - "print(\"\")\n", - "print(f\"Iris data: 10 first records\")\n", + "iris_data_file = os.path.join(kh.get_samples_dir(), \"Iris\", \"Iris.txt\")\n", "iris_df = pd.read_csv(iris_data_file, sep=\"\\t\")\n", + "\n", + "print(f\"Iris data: 10 first records\")\n", "iris_df.head()" ] }, @@ -135,10 +134,10 @@ }, "outputs": [], "source": [ - "adult_data_file = path.join(samples_dir, \"Adult\", \"Adult.txt\")\n", - "print(\"\")\n", - "print(f\"Adult data: 10 first records\")\n", + "adult_data_file = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", "adult_df = pd.read_csv(adult_data_file, sep=\"\\t\")\n", + "\n", + "print(f\"Adult data: 10 first records\")\n", "adult_df.head()" ] }, @@ -160,9 +159,9 @@ "source": [ "X_adult_train = adult_df.drop([\"class\"], axis=1)\n", "y_adult_train = adult_df[\"class\"]\n", + "\n", "print(\"Adult dataset feature matrix (first 10 rows):\")\n", "display(X_adult_train.head(10))\n", - "print(\"\")\n", "print(\"Adult dataset target vector (first 10 values):\")\n", "display(y_adult_train.head(10))" ] @@ -441,5 +440,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/Sklearn Basics 2 - Train a Classifier on a Star Multi-Table Dataset.ipynb b/Sklearn Basics 2 - Train a Classifier on a Star Multi-Table Dataset.ipynb index d61bf26..b48355d 100644 --- a/Sklearn Basics 2 - Train a Classifier on a Star Multi-Table Dataset.ipynb +++ b/Sklearn Basics 2 - Train a Classifier on a Star Multi-Table Dataset.ipynb @@ -9,7 +9,7 @@ "In this notebook, we will learn how to train a classifier with a multi-table data composed of two tables (a root table and a secondary table). 
It is highly recommended to see the _Sklearn Basics 1_ lesson if you are not familiar with Khiops' sklearn estimators.\n", "\n", "\n", - "We start by importing Khiops sklearn classifier `KhiopsClassifier` and saving the location of the Khiops `Samples` directory into a variable:" + "We start by importing the sklearn estimator `KhiopsClassifier`:" ] }, { @@ -18,11 +18,15 @@ "metadata": {}, "outputs": [], "source": [ - "from os import path\n", + "import os\n", "import pandas as pd\n", - "\n", "from khiops import core as kh\n", - "from khiops.sklearn import KhiopsClassifier" + "from khiops.sklearn import KhiopsClassifier\n", + "from khiops.utils.helpers import train_test_split_dataset\n", + "from sklearn import metrics\n", + "\n", + "# If there are any issues you may check the Khiops status with the following command\n", + "# kh.get_runner().print_status()" ] }, { @@ -79,13 +83,13 @@ "metadata": {}, "outputs": [], "source": [ - "sarcasm_dataset_dir = path.join(\"data\", \"HeadlineSarcasm\")\n", - "headlines_file = path.join(sarcasm_dataset_dir, \"Headlines.txt\")\n", + "sarcasm_dataset_dir = os.path.join(\"data\", \"HeadlineSarcasm\")\n", + "headlines_file = os.path.join(sarcasm_dataset_dir, \"Headlines.txt\")\n", "headlines_df = pd.read_csv(headlines_file, sep=\"\\t\")\n", "print(\"Headlines table (first 10 rows)\")\n", "display(headlines_df.head(10))\n", "\n", - "headlines_words_file = path.join(sarcasm_dataset_dir, \"HeadlineWords.txt\")\n", + "headlines_words_file = os.path.join(sarcasm_dataset_dir, \"HeadlineWords.txt\")\n", "headlines_words_df = pd.read_csv(headlines_words_file, sep=\"\\t\")\n", "print(\"HeadlineWords table (first 10 rows)\")\n", "display(headlines_words_df.head(10))" ] @@ -104,8 +108,8 @@ "metadata": {}, "outputs": [], "source": [ - "headlines_train_df = headlines_df.drop(\"IsSarcasm\", axis=1)\n", - "y_sarcasm_train = headlines_df[\"IsSarcasm\"]" + "headlines_main_df = headlines_df.drop(\"IsSarcasm\", axis=1)\n", + "y_sarcasm = 
headlines_df[\"IsSarcasm\"]" ] }, { @@ -136,15 +140,36 @@ "metadata": {}, "outputs": [], "source": [ - "X_sarcasm_train = {\n", + "X_sarcasm = {\n", " \"main_table\": \"headlines\",\n", " \"tables\": {\n", - " \"headlines\": (headlines_train_df, \"HeadlineId\"),\n", + " \"headlines\": (headlines_main_df, \"HeadlineId\"),\n", " \"headline_words\": (headlines_words_df, \"HeadlineId\"),\n", " },\n", "}" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To separate this dataset into train and test, we use the `khiops-python` helper function `train_test_split_dataset`. This function allows splitting ``dict`` dataset specifications:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "(\n", + "    X_sarcasm_train,\n", + "    X_sarcasm_test,\n", + "    y_sarcasm_train,\n", + "    y_sarcasm_test,\n", + ") = train_test_split_dataset(X_sarcasm, y_sarcasm)" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ @@ -194,7 +219,29 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, we use our sarcasm classifier to obtain predictions on the training data. We normally do that on new test data, and again a multi-table dataset specification would have been needed."
+ "Now, we use our sarcasm classifier to obtain predictions and probabilities on the test data:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "y_sarcasm_test_predicted = khc_sarcasm.predict(X_sarcasm_test)\n", + "probas_sarcasm_test = khc_sarcasm.predict_proba(X_sarcasm_test)\n", + "\n", + "print(\"HeadlineSarcasm test predictions (first 10 values):\")\n", + "display(y_sarcasm_test_predicted[:10])\n", + "print(\"HeadlineSarcasm test prediction probabilities (first 10 values):\")\n", + "display(probas_sarcasm_test[:10])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally we may estimate the accuracy and AUC for the test data:" ] }, { @@ -203,9 +250,11 @@ "metadata": {}, "outputs": [], "source": [ - "sarcasm_predictions = khc_sarcasm.predict(X_sarcasm_train)\n", - "print(\"HeadlineSarcasm train predictions (first 10 values):\")\n", - "display(sarcasm_predictions[:10])" + "sarcasm_test_accuracy = metrics.accuracy_score(y_sarcasm_test, y_sarcasm_test_predicted)\n", + "sarcasm_test_auc = metrics.roc_auc_score(y_sarcasm_test, probas_sarcasm_test[:, 1])\n", + "\n", + "print(f\"Sarcasm test accuracy: {sarcasm_test_accuracy}\")\n", + "print(f\"Sarcasm test auc : {sarcasm_test_auc}\")" ] }, { @@ -244,16 +293,16 @@ }, "outputs": [], "source": [ - "accidents_dataset_dir = path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", + "accidents_dataset_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", "\n", - "accidents_file = path.join(accidents_dataset_dir, \"Accidents.txt\")\n", - "accidents_df = pd.read_csv(accidents_file, sep=\"\\t\", encoding=\"ISO-8859-1\")\n", + "accidents_file = os.path.join(accidents_dataset_dir, \"Accidents.txt\")\n", + "accidents_df = pd.read_csv(accidents_file, sep=\"\\t\", encoding=\"latin1\")\n", "print(f\"Accidents dataframe (first 10 rows):\")\n", "display(accidents_df.head(10))\n", "print()\n", "\n", - "vehicles_file = 
path.join(accidents_dataset_dir, \"Vehicles.txt\")\n", - "vehicles_df = pd.read_csv(vehicles_file, sep=\"\\t\", encoding=\"ISO-8859-1\")\n", + "vehicles_file = os.path.join(accidents_dataset_dir, \"Vehicles.txt\")\n", + "vehicles_df = pd.read_csv(vehicles_file, sep=\"\\t\", encoding=\"latin1\")\n", "print(f\"Vehicles dataframe (first 10 rows):\")\n", "display(vehicles_df.head(10))" ] @@ -276,7 +325,7 @@ "outputs": [], "source": [ "accidents_main_df = accidents_df.drop(\"Gravity\", axis=1)\n", - "y_accidents_train = accidents_df[\"Gravity\"]" + "y_accidents = accidents_df[\"Gravity\"]" ] }, { @@ -296,7 +345,7 @@ }, "outputs": [], "source": [ - "X_accidents_train = {\n", + "X_accidents = {\n", " \"main_table\": \"accidents\",\n", " \"tables\": {\n", " \"accidents\": (accidents_main_df, \"AccidentId\"),\n", @@ -305,6 +354,29 @@ "}" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Split the dataset into train and test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "is_khiops_tutorial_solution": true + }, + "outputs": [], + "source": [ + "(\n", + " X_accidents_train,\n", + " X_accidents_test,\n", + " y_accidents_train,\n", + " y_accidents_test,\n", + ") = train_test_split_dataset(X_accidents, y_accidents)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -331,13 +403,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Print the accuracy and auc of the model\n" + "#### Print the train accuracy and auc of the model\n" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "is_khiops_tutorial_solution": true + }, "outputs": [], "source": [ "accidents_train_performance = (\n", @@ -351,9 +425,32 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Deploy the classifier to obtain predictions on the training data\n", + "#### Deploy the classifier to obtain predictions and its probabilites on the test data" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "is_khiops_tutorial_solution": true + }, + "outputs": [], + "source": [ + "y_accidents_test_predicted = khc_accidents.predict(X_accidents_test)\n", + "probas_accidents_test = khc_accidents.predict_proba(X_accidents_test)\n", "\n", - "*Note that usually one deploys the model on new test data. We deploy on the train dataset to keep the tutorial simple*.\n" + "print(\"Accidents test predictions (first 10 values):\")\n", + "display(y_accidents_test_predicted[:10])\n", + "print(\"Accidents test prediction probabilities (first 10 values):\")\n", + "display(probas_accidents_test[:10])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Obtain the accuracy and AUC on the test dataset\n", + "\n" ] }, { @@ -364,7 +461,15 @@ "metadata": { "is_khiops_tutorial_solution": true }, "outputs": [], "source": [ - "khc_accidents.predict(X_accidents_train)" + "accidents_test_accuracy = metrics.accuracy_score(\n", + " y_accidents_test, y_accidents_test_predicted\n", + ")\n", + "accidents_test_auc = metrics.roc_auc_score(\n", + " y_accidents_test, probas_accidents_test[:, 1]\n", + ")\n", + "\n", + "print(f\"Accidents test accuracy: {accidents_test_accuracy}\")\n", + "print(f\"Accidents test auc : {accidents_test_auc}\")" ] } ], @@ -389,4 +494,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/Sklearn Basics 3 - Train a Classifier on a Snowflake Multi-Table Dataset.ipynb b/Sklearn Basics 3 - Train a Classifier on a Snowflake Multi-Table Dataset.ipynb index 06bd3fa..8b3f816 100644 --- a/Sklearn Basics 3 - Train a Classifier on a Snowflake Multi-Table Dataset.ipynb +++ b/Sklearn Basics 3 - Train a Classifier on a Snowflake Multi-Table Dataset.ipynb @@ -8,7 +8,7 @@ "\n", "In this notebook, we will learn how to train a classifier with a more complex multi-table data where a secondary table is itself a parent table of another table (ie. snowflake schema). 
It is highly recommended to see the _Sklearn Basics 1_ and _Sklearn Basics 2_ lessons if you are not familiar with Khiops' sklearn estimators.\n", "\n", - "We start by importing Khiops sklearn classifier `KhiopsClassifier` and saving the location of the Khiops `Samples` directory into a variable:" + "We start by importing the sklearn estimator `KhiopsClassifier`:" ] }, { @@ -17,11 +17,15 @@ "metadata": {}, "outputs": [], "source": [ - "from os import path\n", + "import os\n", "import pandas as pd\n", - "\n", "from khiops import core as kh\n", - "from khiops.sklearn import KhiopsClassifier" + "from khiops.sklearn import KhiopsClassifier\n", + "from khiops.utils.helpers import train_test_split_dataset\n", + "from sklearn import metrics\n", + "\n", + "# If there are any issues you may Khiops status with the following command\n", + "# kh.get_runner().print_status()" ] }, { @@ -58,49 +62,30 @@ "metadata": {}, "outputs": [], "source": [ - "accidents_dataset_dir = path.join(kh.get_samples_dir(), \"Accidents\")\n", + "accidents_dataset_dir = os.path.join(kh.get_samples_dir(), \"Accidents\")\n", "\n", - "accidents_file = path.join(\n", - " path.join(kh.get_samples_dir(), \"AccidentsSummary\"), \"Accidents.txt\"\n", - ")\n", + "accidents_file = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\", \"Accidents.txt\")\n", "accidents_df = pd.read_csv(accidents_file, sep=\"\\t\", encoding=\"latin1\")\n", "print(f\"Accident dataframe (first 10 rows):\")\n", "display(accidents_df.head(10))\n", "print()\n", "\n", - "vehicles_file = path.join(accidents_dataset_dir, \"Vehicles.txt\")\n", + "vehicles_file = os.path.join(accidents_dataset_dir, \"Vehicles.txt\")\n", "vehicles_df = pd.read_csv(vehicles_file, sep=\"\\t\", encoding=\"latin1\")\n", "print(f\"Vehicle dataframe (first 10 rows):\")\n", "display(vehicles_df.head(10))\n", "\n", "# We drop the \"Gravity\" column as it was used to create the target\n", - "users_file = path.join(accidents_dataset_dir, \"Users.txt\")\n", + 
"users_file = os.path.join(accidents_dataset_dir, \"Users.txt\")\n", "users_df = pd.read_csv(users_file, sep=\"\\t\", encoding=\"latin1\").drop(\"Gravity\", axis=1)\n", "print(f\"User dataframe (first 10 rows):\")\n", "display(users_df.head(10))\n", "print()\n", "\n", - "places_file = path.join(accidents_dataset_dir, \"Places.txt\")\n", + "places_file = os.path.join(accidents_dataset_dir, \"Places.txt\")\n", "places_df = pd.read_csv(places_file, sep=\"\\t\", encoding=\"latin1\")\n", "print(f\"Places dataframe (first 10 rows):\")\n", - "display(places_df.head(10))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Create the main feature matrix and the target vector for `Accidents`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "accidents_main_df = accidents_df.drop(\"Gravity\", axis=1)\n", - "y_accidents_train = accidents_df[\"Gravity\"]" + "display(places_df.head(10))" ] }, { @@ -112,8 +97,7 @@ "Note the main table `Accidents` and the secondary table `Places` have one key `AccidentId`.\n", "Tables `Vehicles` (the other secondary table) and `Users` (the tertiary table) have two keys: `AccidentId` and `VehicleId`.\n", "\n", - "To describe relations between tables, the field `relations` must be added to the dictionary of table specifications. This field\n", - "contains a list of tuples describing the relations between tables. The first two values (`str`) of each tuple correspond to names of both the parent and the child table involved in the relation. A third value (`bool`) can be optionally set as `True` to indicate that the relation is `1:1`. For example, if the tuple `(table1, table2, True)` is contained in this field, it means that:\n", + "To describe relations between tables, we add the `relations` field to the dataset spec. This field contains a list of tuples describing the relations between tables. 
The first two values (`str`) of each tuple correspond to names of both the parent and the child table involved in the relation. A third value (`bool`) can be optionally set as `True` to indicate that the relation is `1:1`. For example, if the tuple `(table1, table2, True)` is contained in this field, it means that:\n", "\n", " - `table1` and `table2` are in a `1:1` relationship\n", " - The key of `table1` is contained in that of `table2` (ie. keys are hierarchical)\n", @@ -127,22 +111,43 @@ "metadata": {}, "outputs": [], "source": [ - "X_accidents_train = {\n", + "X_accidents = {\n", " \"main_table\": \"Accidents\",\n", " \"tables\": {\n", - " \"Accidents\": (accidents_main_df, \"AccidentId\"),\n", + " \"Accidents\": (accidents_df.drop(\"Gravity\", axis=1), \"AccidentId\"),\n", " \"Vehicles\": (vehicles_df, [\"AccidentId\", \"VehicleId\"]),\n", " \"Users\": (users_df, [\"AccidentId\", \"VehicleId\"]),\n", " \"Places\": (places_df, [\"AccidentId\"]),\n", - "\n", " },\n", " \"relations\": [\n", " (\"Accidents\", \"Vehicles\"),\n", " (\"Vehicles\", \"Users\"),\n", " (\"Accidents\", \"Places\", True),\n", - "\n", " ],\n", - "}" + "}\n", + "y_accidents = accidents_df[\"Gravity\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Split the dataset into train and test\n", + "We use the helper function `train_test_split_dataset` with the `X` dataset spec to obtain one spec for train and another for test." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "(\n", + " X_accidents_train,\n", + " X_accidents_test,\n", + " y_accidents_train,\n", + " y_accidents_test,\n", + ") = train_test_split_dataset(X_accidents, y_accidents, test_size=0.3)" ] }, { @@ -169,7 +174,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Print the accuracy and auc of the model\n" + "#### Print the train accuracy and train auc of the model" ] }, { @@ -189,20 +194,29 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Deploy the classifier to obtain predictions on the training data\n", - "\n", - "Note that usually one deploys the model on new test data. We deploy on the train dataset to keep the tutorial simple*.\n" + "#### Deploy the classifier to obtain predictions and probabilities on the test data" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "is_khiops_tutorial_solution": true - }, + "metadata": {}, "outputs": [], "source": [ - "khc_accidents.predict(X_accidents_train)" + "y_accidents_test_predicted = khc_accidents.predict(X_accidents_test)\n", + "probas_accidents_test = khc_accidents.predict_proba(X_accidents_test)\n", + "\n", + "print(\"Accidents test predictions (first 10 values):\")\n", + "display(y_accidents_test_predicted[:10])\n", + "print(\"Accidents test prediction probabilities (first 10 values):\")\n", + "display(probas_accidents_test[:10])" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Estimate the accuracy and AUC metrics on the test data" ] }, { @@ -210,13 +224,23 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "accidents_test_accuracy = metrics.accuracy_score(\n", + " y_accidents_test, y_accidents_test_predicted\n", + ")\n", + "accidents_test_auc = metrics.roc_auc_score(\n", + " y_accidents_test, probas_accidents_test[:, 1]\n", + ")\n", + "\n", + "print(f\"Accidents test accuracy: 
{accidents_test_accuracy}\")\n", + "print(f\"Accidents test auc : {accidents_test_auc}\")" + ] } ], "metadata": { "celltoolbar": "Edit Metadata", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -229,10 +253,9 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" + "pygments_lexer": "ipython3" } }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/Sklearn Basics 4 - Train a Coclustering.ipynb b/Sklearn Basics 4 - Train a Coclustering.ipynb index b6d77e3..f0f3ba2 100644 --- a/Sklearn Basics 4 - Train a Coclustering.ipynb +++ b/Sklearn Basics 4 - Train a Coclustering.ipynb @@ -7,7 +7,7 @@ "# Sklearn Basics 4: Train a Coclustering\n", "The steps to train a coclustering model with Khiops are very similar to what we have already seen in the basic classifier tutorials.\n", "\n", - "We start by importing `KhiopsCoclustering` estimators and some helper functions:" + "We start by importing the sklearn estimator `KhiopsCoclustering` and defining a helper function:" ] }, { @@ -16,12 +16,26 @@ "metadata": {}, "outputs": [], "source": [ - "from os import path\n", - "import numpy as np\n", + "import os\n", + "import platform\n", + "import subprocess\n", "import pandas as pd\n", - "from helper_functions import explorer_open\n", + "from khiops import core as kh\n", + "from khiops.sklearn import KhiopsCoclustering\n", "\n", - "from khiops.sklearn import KhiopsCoclustering" + "\n", + "def os_open(path):\n", + " \"\"\"Opens a file or directory with its default application\"\"\"\n", + " if platform.system() == \"Windows\":\n", + " os.startfile(path)\n", + " elif platform.system() == \"Darwin\":\n", + " subprocess.call([\"open\", path])\n", + " else:\n", + " subprocess.call([\"xdg-open\", path])\n", + "\n", + "\n", + "# If there are any issues you may Khiops status with the following 
command\n", + "# kh.get_runner().print_status()" ] }, { @@ -44,7 +58,7 @@ "metadata": {}, "outputs": [], "source": [ - "countries_data_file = path.join(\n", + "countries_data_file = os.path.join(\n", " \"data\", \"CountriesByOrganization\", \"CountriesByOrganization.csv\"\n", ")\n", "X_countries = pd.read_csv(countries_data_file, sep=\";\")\n", @@ -124,7 +138,7 @@ "metadata": {}, "outputs": [], "source": [ - "countries_report = path.join(\"exercises\", \"countries.khcj\")\n", + "countries_report = os.path.join(\"exercises\", \"countries.khcj\")\n", "khcc_countries.export_report_file(countries_report)\n", "# explorer_open(countries_report)" ] @@ -163,11 +177,13 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "is_khiops_tutorial_solution": true + }, "outputs": [], "source": [ - "tokyo_data_file = path.join(\"data\", \"Tokyo2021\", \"Athletes.csv\")\n", - "X_tokyo = pd.read_csv(tokyo_data_file, encoding=\"ISO-8859-1\")\n", + "tokyo_data_file = os.path.join(\"data\", \"Tokyo2021\", \"Athletes.csv\")\n", + "X_tokyo = pd.read_csv(tokyo_data_file, encoding=\"latin1\")\n", "print(\"Tokyo2021 dataset (first 10 rows):\")\n", "display(X_tokyo.head(10))" ] @@ -186,7 +202,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "is_khiops_tutorial_solution": true + }, "outputs": [], "source": [ "khcc_tokyo = KhiopsCoclustering()\n", @@ -213,7 +231,7 @@ ").clusters\n", "tokyo_leaf_clusters = [cluster for cluster in tokyo_clusters if cluster.is_leaf]\n", "print(f\"Number of leaf clusters: {len(tokyo_leaf_clusters)}:\")\n", - "for index, cluster in enumerate(tokyo_leaf_clusters, start=1):\n", + "for index, cluster in enumerate(tokyo_leaf_clusters):\n", " print(f\"cluster {index:02d}: {cluster.name}\")" ] }, @@ -232,8 +250,8 @@ }, "outputs": [], "source": [ - "print(f\"Members of the cluster {tokyo_leaf_clusters[4].name}:\")\n", - "for value_obj in tokyo_leaf_clusters[4].leaf_part.values:\n", + "print(f\"Members 
of the cluster {tokyo_leaf_clusters[29].name}:\")\n", + "for value_obj in tokyo_leaf_clusters[29].leaf_part.values:\n", " print(value_obj.value)" ] }, @@ -252,9 +270,11 @@ }, "outputs": [], "source": [ - "tokyo_report = path.join(\"exercises\", \"tokyo.khcj\")\n", + "tokyo_report = os.path.join(\"exercises\", \"tokyo.khcj\")\n", "khcc_tokyo.export_report_file(tokyo_report)\n", - "# explorer_open(tokyo_report)" + "\n", + "# To visualize uncomment the line below\n", + "# os_open(tokyo_report)" ] }, { @@ -298,5 +318,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/create-coursework.py b/create-coursework.py index 85b4a95..a6934f4 100644 --- a/create-coursework.py +++ b/create-coursework.py @@ -14,9 +14,6 @@ def main(): os.mkdir(coursework_dir_path) print("Creating resources ...") - helper_functions_path = os.path.join(".", "./helper_functions.py") - shutil.copy(helper_functions_path, coursework_dir_path) - data_dir_path = os.path.join(".", "data") coursework_data_dir_path = os.path.join(coursework_dir_path, "data") shutil.copytree(data_dir_path, coursework_data_dir_path) diff --git a/helper_functions.py b/helper_functions.py deleted file mode 100644 index c0c6c07..0000000 --- a/helper_functions.py +++ /dev/null @@ -1,26 +0,0 @@ -import platform -import os -import subprocess - - -def peek(filePath, n=10, l=80): - """Shows the first n lines of a file with a maximum of l columns""" - with open(filePath, encoding="utf8", errors="replace") as file: - for i, line in enumerate(file): - if i > n: - print("...") - break - print(line.rstrip()[0:l], end="") - if len(line) > l: - print(" ...", end="") - print("") - - -def explorer_open(file_path): - if platform.system() == "Windows": - os.startfile(file_path) - else: - if platform.system() == "Darwin": - subprocess.call("open", file_path) - else: - subprocess.call("xdg-open", file_path)