diff --git a/.github/workflows/mkdocs-release.yml b/.github/workflows/mkdocs-release.yml
new file mode 100644
index 000000000..f625ab41d
--- /dev/null
+++ b/.github/workflows/mkdocs-release.yml
@@ -0,0 +1,40 @@
+name: mkdocs-release
+
+on:
+ push:
+ branches: [ branch-* ]
+
+jobs:
+ publish-release:
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+
+ - name: Checkout hopsworks-tutorials
+ uses: actions/checkout@v3
+ with:
+ repository: logicalclocks/hopsworks-tutorials
+ path: docs/hopsworks-tutorials
+ fetch-depth: 0
+
+ - uses: actions/setup-python@v2
+ with:
+ python-version: '3.8'
+
+ - name: install deps
+ run: pip3 install pip==22.0.3 && pip3 install 'git+https://github.com/logicalclocks/feature-store-api@master#egg=hsfs[docs]&subdirectory=python'
+
+ - name: copy tutorial images
+ run: python prepare_images.py
+
+ - name: setup git
+ run: |
+ git config --global user.name Mike
+ git config --global user.email mike@docs.hopsworks.ai
+
+ # Put this back and increment version when cutting a new release branch
+ # - name: mike deploy docs
+ # run: mike deploy 3.0 latest -u --push
diff --git a/.github/workflows/mkdocs-test.yml b/.github/workflows/mkdocs-test.yml
new file mode 100644
index 000000000..1e638e725
--- /dev/null
+++ b/.github/workflows/mkdocs-test.yml
@@ -0,0 +1,38 @@
+name: mkdocs-test
+
+on: pull_request
+
+jobs:
+ test-docs-build:
+ runs-on: ubuntu-latest
+
+ steps:
+ - name: Checkout main repo
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+
+ - name: Checkout hopsworks-tutorials
+ uses: actions/checkout@v3
+ with:
+ repository: logicalclocks/hopsworks-tutorials
+ path: docs/hopsworks-tutorials
+ fetch-depth: 0
+
+ - uses: actions/setup-python@v2
+ with:
+ python-version: '3.8'
+
+ - name: install deps
+ run: pip3 install pip==22.0.3 && pip3 install 'git+https://github.com/logicalclocks/feature-store-api@master#egg=hsfs[docs]&subdirectory=python'
+
+ - name: copy tutorial images
+ run: python prepare_images.py
+
+ - name: setup git
+ run: |
+ git config --global user.name Mike
+ git config --global user.email mike@docs.hopsworks.ai
+
+ - name: mike deploy docs
+ run: mike deploy 3.1-SNAPSHOT dev -u
diff --git a/.gitignore b/.gitignore
index add71dfa6..d90b8f4b0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -109,6 +109,7 @@ venv.bak/
# mkdocs documentation
site/
+docs/hopsworks-tutorials/
# mypy
.mypy_cache/
diff --git a/docs/getting_started/images/01_featuregroups.png b/docs/getting_started/images/01_featuregroups.png
deleted file mode 100644
index c2d8b0a4b..000000000
Binary files a/docs/getting_started/images/01_featuregroups.png and /dev/null differ
diff --git a/docs/getting_started/images/02_training-dataset.png b/docs/getting_started/images/02_training-dataset.png
deleted file mode 100644
index f8954e0ed..000000000
Binary files a/docs/getting_started/images/02_training-dataset.png and /dev/null differ
diff --git a/docs/getting_started/images/03_model.png b/docs/getting_started/images/03_model.png
deleted file mode 100644
index 2bb98ed29..000000000
Binary files a/docs/getting_started/images/03_model.png and /dev/null differ
diff --git a/docs/getting_started/images/fg_overview.gif b/docs/getting_started/images/fg_overview.gif
deleted file mode 100644
index e6bd61123..000000000
Binary files a/docs/getting_started/images/fg_overview.gif and /dev/null differ
diff --git a/docs/getting_started/images/fg_statistics.gif b/docs/getting_started/images/fg_statistics.gif
deleted file mode 100644
index 1c8fe0884..000000000
Binary files a/docs/getting_started/images/fg_statistics.gif and /dev/null differ
diff --git a/docs/getting_started/images/fv_overview.gif b/docs/getting_started/images/fv_overview.gif
deleted file mode 100644
index 5dd77672e..000000000
Binary files a/docs/getting_started/images/fv_overview.gif and /dev/null differ
diff --git a/docs/getting_started/images/icon102.png b/docs/getting_started/images/icon102.png
deleted file mode 100755
index bebb5b1cd..000000000
Binary files a/docs/getting_started/images/icon102.png and /dev/null differ
diff --git a/docs/getting_started/images/provenance.gif b/docs/getting_started/images/provenance.gif
deleted file mode 100644
index 4ad190fa2..000000000
Binary files a/docs/getting_started/images/provenance.gif and /dev/null differ
diff --git a/docs/getting_started/images/serving_endpoints.gif b/docs/getting_started/images/serving_endpoints.gif
deleted file mode 100644
index de46c5b4f..000000000
Binary files a/docs/getting_started/images/serving_endpoints.gif and /dev/null differ
diff --git a/docs/getting_started/quickstart.ipynb b/docs/getting_started/quickstart.ipynb
deleted file mode 100644
index 2bda7993b..000000000
--- a/docs/getting_started/quickstart.ipynb
+++ /dev/null
@@ -1,1587 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "[](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/quickstart.ipynb)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "Ovynr30zkqnO",
- "outputId": "ab854e5a-4401-45b4-e834-1e65335dad6d"
- },
- "outputs": [],
- "source": [
- "!pip install -U hopsworks==3.0.0rc5 --quiet"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "NpwpPe1wxQ5M"
- },
- "source": [
- "## 💽 Loading the Data\n",
- "\n",
- "The data you will use comes from three different CSV files:\n",
- "\n",
- "* credit_cards.csv: information such as the expiration date and provider.\n",
- "* transactions.csv: events containing information about when a credit card was used, such as a timestamp, location, and the amount spent. A boolean fraud_label variable (True/False) tells us whether a transaction was fraudulent or not.\n",
- "* profiles.csv: credit card user information such as birthdate and city of residence.\n",
- "\n",
- "In a production system, these CSV files would originate from separate data sources or tables, and probably separate data pipelines. All three files have a common credit card number column cc_num, which you will use later to join features together from the different datasets.\n",
- "\n",
- "Now, you can go ahead and load the data.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 143
- },
- "id": "ARrJ_Bp5xMIk",
- "outputId": "14e7a020-e04a-40d5-fdea-c4ba71f8a034"
- },
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import numpy as np\n",
- "\n",
- "window_len = \"4h\"\n",
- "url = \"https://repo.hops.works/master/hopsworks-tutorials/data/card_fraud_data/\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "credit_cards_df = pd.read_csv(url + \"credit_cards.csv\")\n",
- "credit_cards_df.head(3)\n",
- "\n",
- "profiles_df = pd.read_csv(url + \"profiles.csv\", parse_dates=[\"birthdate\"])\n",
- "profiles_df.head(3)\n",
- "\n",
- "trans_df = pd.read_csv(url + \"transactions.csv\", parse_dates=[\"datetime\"])\n",
- "trans_df.head(3)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "HPq2qUtNxjaM"
- },
- "source": [
- "\n",
- "🛠️ Feature Engineering\n",
- "\n",
- "Fraudulent transactions can differ from regular ones in many different ways. Typical red flags would for instance be a large transaction volume/frequency in the span of a few hours. It could also be the case that elderly people in particular are targeted by fraudsters. To facilitate model learning, we will create additional features based on these patterns. In particular, we will create two types of features:\n",
- "\n",
- "* Features that aggregate data from different data sources. This could for instance be the age of a customer at the time of a transaction, which combines the birthdate feature from profiles.csv with the datetime feature from transactions.csv.\n",
- "* Features that aggregate data from multiple time steps. An example of this could be the transaction frequency of a credit card in the span of a few hours, which is computed using a window function.\n",
- "\n",
- "Now you are ready to start with the first category.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 206
- },
- "id": "ngEPnNzAxqsJ",
- "outputId": "c8cf6082-1d1d-4bf7-9d81-ac2294146a27"
- },
- "outputs": [],
- "source": [
- "# Compute age at transaction.\n",
- "age_df = trans_df.merge(profiles_df, on=\"cc_num\", how=\"left\")\n",
- "trans_df[\"age_at_transaction\"] = (age_df[\"datetime\"] - age_df[\"birthdate\"]) / np.timedelta64(1, \"Y\")\n",
- "\n",
- "# Compute days until card expires.\n",
- "card_expiry_df = trans_df.merge(credit_cards_df, on=\"cc_num\", how=\"left\")\n",
- "card_expiry_df[\"expires\"] = pd.to_datetime(card_expiry_df[\"expires\"], format=\"%m/%y\")\n",
- "trans_df[\"days_until_card_expires\"] = (card_expiry_df[\"expires\"] - card_expiry_df[\"datetime\"]) / np.timedelta64(1, \"D\")\n",
- "\n",
- "trans_df[[\"age_at_transaction\", \"days_until_card_expires\"]].head()\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "zEC12W4ux2Uk"
- },
- "source": [
- "The next step is that you will create features from aggregations that are computed over every credit card over multiple time steps.\n",
- "\n",
- "You start by computing a feature that captures the physical distance between consecutive transactions, which we will call `loc_delta`. Here, you will use Haversine distance to quantify the distance between two longitude and latitude coordinates.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "rQ-g4ETOx4O5"
- },
- "outputs": [],
- "source": [
- "from math import radians\n",
- "\n",
- "# Do some simple preprocessing.\n",
- "trans_df.sort_values(\"datetime\", inplace=True)\n",
- "trans_df[[\"longitude\", \"latitude\"]] = trans_df[[\"longitude\", \"latitude\"]].applymap(radians)\n",
- "\n",
- "def haversine(long, lat):\n",
- " \"\"\"Compute Haversine distance between each consecutive coordinate in (long, lat).\"\"\"\n",
- "\n",
- " long_shifted = long.shift()\n",
- " lat_shifted = lat.shift()\n",
- " long_diff = long_shifted - long\n",
- " lat_diff = lat_shifted - lat\n",
- "\n",
- " a = np.sin(lat_diff/2.0)**2\n",
- " b = np.cos(lat) * np.cos(lat_shifted) * np.sin(long_diff/2.0)**2\n",
- " c = 2*np.arcsin(np.sqrt(a + b))\n",
- "\n",
- " return c\n",
- "\n",
- "\n",
- "trans_df[\"loc_delta\"] = trans_df.groupby(\"cc_num\")\\\n",
- " .apply(lambda x : haversine(x[\"longitude\"], x[\"latitude\"]))\\\n",
- " .reset_index(level=0, drop=True)\\\n",
- " .fillna(0)\n",
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "a_MHfwYsGfbo"
- },
- "source": [
- "Next you will compute windowed aggregates. Here you will use 4-hour windows, but feel free to experiment with different window lengths by setting `window_len` below to a value of your choice."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 206
- },
- "id": "jmywmIVKGgLR",
- "outputId": "32ad2881-9a27-483f-c8e4-9ef94af6dd6e"
- },
- "outputs": [],
- "source": [
- "cc_group = trans_df[[\"cc_num\", \"amount\", \"datetime\"]].groupby(\"cc_num\").rolling(window_len, on=\"datetime\")\n",
- "\n",
- "# Moving average of transaction volume.\n",
- "df_4h_mavg = pd.DataFrame(cc_group.mean())\n",
- "df_4h_mavg.columns = [\"trans_volume_mavg\", \"datetime\"]\n",
- "df_4h_mavg = df_4h_mavg.reset_index(level=[\"cc_num\"])\n",
- "df_4h_mavg = df_4h_mavg.drop(columns=[\"cc_num\", \"datetime\"])\n",
- "df_4h_mavg = df_4h_mavg.sort_index()\n",
- "\n",
- "# Moving standard deviation of transaction volume.\n",
- "df_4h_std = pd.DataFrame(cc_group.mean())\n",
- "df_4h_std.columns = [\"trans_volume_mstd\", \"datetime\"]\n",
- "df_4h_std = df_4h_std.reset_index(level=[\"cc_num\"])\n",
- "df_4h_std = df_4h_std.drop(columns=[\"cc_num\", \"datetime\"])\n",
- "df_4h_std = df_4h_std.fillna(0)\n",
- "df_4h_std = df_4h_std.sort_index()\n",
- "window_aggs_df = df_4h_std.merge(df_4h_mavg,left_index=True, right_index=True)\n",
- "\n",
- "# Moving average of transaction frequency.\n",
- "df_4h_count = pd.DataFrame(cc_group.mean())\n",
- "df_4h_count.columns = [\"trans_freq\", \"datetime\"]\n",
- "df_4h_count = df_4h_count.reset_index(level=[\"cc_num\"])\n",
- "df_4h_count = df_4h_count.drop(columns=[\"cc_num\", \"datetime\"])\n",
- "df_4h_count = df_4h_count.sort_index()\n",
- "window_aggs_df = window_aggs_df.merge(df_4h_count,left_index=True, right_index=True)\n",
- "\n",
- "# Moving average of location difference between consecutive transactions.\n",
- "cc_group = trans_df[[\"cc_num\", \"loc_delta\", \"datetime\"]].groupby(\"cc_num\").rolling(window_len, on=\"datetime\").mean()\n",
- "df_4h_loc_delta_mavg = pd.DataFrame(cc_group)\n",
- "df_4h_loc_delta_mavg.columns = [\"loc_delta_mavg\", \"datetime\"]\n",
- "df_4h_loc_delta_mavg = df_4h_loc_delta_mavg.reset_index(level=[\"cc_num\"])\n",
- "df_4h_loc_delta_mavg = df_4h_loc_delta_mavg.drop(columns=[\"cc_num\", \"datetime\"])\n",
- "df_4h_loc_delta_mavg = df_4h_loc_delta_mavg.sort_index()\n",
- "window_aggs_df = window_aggs_df.merge(df_4h_loc_delta_mavg,left_index=True, right_index=True)\n",
- "\n",
- "window_aggs_df = window_aggs_df.merge(trans_df[[\"cc_num\", \"datetime\"]].sort_index(),left_index=True, right_index=True)\n",
- "window_aggs_df.tail()\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "yB90r9qszLe2"
- },
- "source": [
- "## 🪄 Creating Feature Groups\n",
- "\n",
- "A feature group can be seen as a collection of conceptually related features that are computed together at the same cadence. In your case, you will create a feature group for the transaction data and a feature group for the windowed aggregations on the transaction data. Both will have `tid` as primary key, which will allow you to join them together to create training data in a follow-on tutorial.\n",
- "\n",
- "Feature groups provide a namespace for features, so two features are allowed to have the same name as long as they belong to different feature groups. For instance, in a real-life setting we would likely want to experiment with different window lengths. In that case, we can create feature groups with identical schema for each window length.\n",
- "\n",
- "Before you can create a feature group we need to connect to our feature store.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "WFmD_15TzMHX",
- "outputId": "6acf8632-6993-485c-fb2a-31f27f7b462f"
- },
- "outputs": [],
- "source": [
- "import hopsworks\n",
- "\n",
- "project = hopsworks.login()\n",
- "fs = project.get_feature_store()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "EdpXZQnxzWkd"
- },
- "source": [
- "To create a feature group we need to give it a name and specify a primary key. It is also good to provide a description of the contents of the feature group and a version number, if it is not defined it will automatically be incremented to 1."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "kDUHXzJ3zXxN"
- },
- "outputs": [],
- "source": [
- "trans_fg = fs.get_or_create_feature_group(\n",
- " name=\"transactions\",\n",
- " version=\"1\",\n",
- " description=\"Transaction data\",\n",
- " primary_key=['cc_num'],\n",
- " event_time=['datetime']\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "J3ZePNxmzbdn"
- },
- "source": [
- "A full list of arguments can be found in the [documentation](https://docs.hopsworks.ai).\n",
- "\n",
- "At this point, we have only specified some metadata for the feature group. It does not store any data or even have a schema defined for the data. To make the feature group persistent we populate it with its associated data.\n",
- "\n",
- "If you have previously inserted this data into *trans_fg*, you can skip this step.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "ptNT4PpPzcfV",
- "outputId": "f432bd20-558a-4c62-e967-f96a2100f3b0"
- },
- "outputs": [],
- "source": [
- "trans_fg.insert(trans_df)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "KzmLXQqizoZ4"
- },
- "source": [
- "We can move on and do the same thing for the feature group with our windows aggregation."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "PCE0H2TxzruP"
- },
- "outputs": [],
- "source": [
- "window_aggs_fg = fs.get_or_create_feature_group(\n",
- " name=f\"transactions_{window_len}_aggs\",\n",
- " version=1,\n",
- " description=f\"Aggregate transaction data over {window_len} windows.\",\n",
- " primary_key=['cc_num'],\n",
- " event_time=['datetime']\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "TYk1qaE7zu0x",
- "outputId": "a1c94e20-8718-456f-cc66-de655958b311"
- },
- "outputs": [],
- "source": [
- "window_aggs_fg.insert(window_aggs_df)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "83QAknPe0Jui"
- },
- "source": [
- "Click on the hyperlink printed in the cell output above to inspect your feature group in the UI."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "Kp5O29jQ0YrY"
- },
- "source": [
- "## 🔪 Feature Selection\n",
- "\n",
- "We start by selecting all the features we want to include for model training/inference.\n",
- "\n",
- "Hopsworks provides a simple DSL (domain specific language) for joining together features from different feature groups. You use the `select()/select_all()/select_except()` feature group methods to select features (from that feature group), and the `join()` method to join together features from a different feature group."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 347
- },
- "id": "DpM-7xwj0N4i",
- "outputId": "1c7463e4-3ba5-440b-98f6-9c09995c3d76"
- },
- "outputs": [],
- "source": [
- "# Select features for training data.\n",
- "ds_query = trans_fg.select([\"fraud_label\", \"category\", \"amount\", \"age_at_transaction\", \"days_until_card_expires\", \"loc_delta\"])\\\n",
- " .join(window_aggs_fg.select_except([\"cc_num\"]), on=\"cc_num\")\\\n",
- "\n",
- "# Uncomment, if you want to inspect the data at this point\n",
- "#ds_query.show(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "fh8y7CWg0di9"
- },
- "source": [
- "### What if features being joined together have the same name (but come from different feature groups)?\n",
- "\n",
- "The problem of name clashes when joining features together can easily happen. For example, recall that you computed the features in transactions_4h_aggs using 4-hour aggregates. If you had created another feature group for 12-hour aggregates, you may have designed an identical schema with the same feature names (just for 12-hours, not 4-hours). If you join features together with identical names from different feature groups, you should pass a prefix argument (e.g., `prefix='4hr'`) in the join operation to give the features unique names in the join object. See the [documentation](https://docs.hopsworks.ai) for more details.\n",
- "\n",
- "## 🤖 Transformation Functions\n",
- "\n",
- "You will preprocess our data using min-max scaling on numerical features and label encoding on categorical features. To do this, for each feature, you can define a one-to-one mapping between your feature and a transformation function. This ensures that transformation functions such as *min-max scaling* are fitted only on the train set (and not the validation/test set), which ensures that there is no leakage into the test set during training. The transformation functions are also needed for model inference pipelines, where pre-computed features are retrieved from the feature store and the same transformation functions (with the same state/parameters computed on the train set) are applied on the features in serving (inference pipelines).\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "MbIo70e50VgP"
- },
- "outputs": [],
- "source": [
- "# Load the transformation functions.\n",
- "min_max_scaler = fs.get_transformation_function(name=\"min_max_scaler\")\n",
- "label_encoder = fs.get_transformation_function(name=\"label_encoder\")\n",
- "\n",
- "# Map features to transformation functions.\n",
- "transformation_functions = {\n",
- " \"category\": label_encoder,\n",
- " \"amount\": min_max_scaler,\n",
- " \"trans_volume_mavg\": min_max_scaler,\n",
- " \"trans_volume_mstd\": min_max_scaler,\n",
- " \"trans_freq\": min_max_scaler,\n",
- " \"loc_delta\": min_max_scaler,\n",
- " \"loc_delta_mavg\": min_max_scaler,\n",
- " \"age_at_transaction\": min_max_scaler,\n",
- " \"days_until_card_expires\": min_max_scaler,\n",
- "}"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "1tIvbPlJ0xJQ"
- },
- "source": [
- "## ⚙️ Feature View Creation\n",
- "\n",
- "The Feature View is the collection of features (from feature groups) and transformation functions used to train models and serve precomputed features to deployed models.\n",
- "\n",
- "The Feature View includes all of the features defined in the query object you created earlier. It can additionally include filters, one or more columns identified as the target(s) (or label) and the set of transformation functions and the features they are applied to. \n",
- "\n",
- "You create a Feature View with `fs.create_feature_view()`. "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "TantyGyl0zqc",
- "outputId": "1d20ea2d-b044-47dd-bb9a-b93a677ed7ac"
- },
- "outputs": [],
- "source": [
- "feature_view = fs.create_feature_view(\n",
- " name='transactions_view',\n",
- " query=ds_query,\n",
- " version=1,\n",
- " labels=[\"fraud_label\"],\n",
- " transformation_functions=transformation_functions\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You can retrieve a reference to an existing feature view with: `fs.get_feature_view('transactions_view')`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "feature_view = fs.get_feature_view('transactions_view', version=1)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "hzwtiWqC1Gs5"
- },
- "source": [
- "## 🏋️ Training Dataset Creation\n",
- "\n",
- "In Hopsworks, training data is the data from a set of features that is used to train a model. The FeatureView provides the set of features for the training data, but you can apply filters to select a subset of data from the data available for those features. For example, you could select training data for a given time range, or data only for users based on a particular geographic region. \n",
- "Training data can be read as *in-memory Pandas DataFrames* or as (materialized) *files (.csv, .tfrecord, etc)* on disk. Training data is immutable and versioned. This makes training data also reproducible - you can delete the training data files and reproduce the exact same training data at a later point in time.\n",
- "\n",
- "Training data may be read with user-defined splits, such as:\n",
- "\n",
- "* Training set - the subset of training data used to train a model.\n",
- "* Validation set - the subset of training data used to evaluate hparams when training a model\n",
- "* Test set - the holdout subset of training data used to evaluate a model\n",
- "\n",
- "You use a FeatureView object to create training data, and there are different methods for reading training data as Pandas DataFrames or as creating training data as files. You can provide time filters specifing the start_time and end_time for training data.\n",
- "\n",
- "For example, this shows you how to create training data as files:\n",
- "\n",
- " from datetime import datetime\n",
- " date_format = \"%Y-%m-%d %H:%M:%S\"\n",
- "\n",
- " start_time = int(float(datetime.strptime(\"2022-01-01 00:00:01\", date_format).timestamp()) * 1000)\n",
- " end_time = int(float(datetime.strptime(\"2022-02-28 23:59:59\", date_format).timestamp()) * 1000)\n",
- "\n",
- " td_version, td_job = feature_view.create_training_data(\n",
- " description = 'transactions_dataset_jan_feb',\n",
- " data_format = 'csv',\n",
- " write_options = {'wait_for_job': True},\n",
- " coalesce = True,\n",
- " start_time = start_time,\n",
- " end_time = end_time,\n",
- " )\n",
- "\n",
- "You can then retrieve the training data from the files using:\n",
- "\n",
- " X_train, y_train, X_test, y_test = feature_view.get_train_test_split(0.2)\n",
- "\n",
- "If you want to read the train/test set as Pandas DataFrames, use the call below which does not save training data as files, but stores metadata in the feature store that a version of training data was created for the feature view.\n",
- "\n",
- "🧬 Read train/test splits from a feature view"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "NGMyGRTnbQol",
- "outputId": "49abbc54-c0ba-4e30-e9bc-91fd5342fe4e"
- },
- "outputs": [],
- "source": [
- "X_train, y_train, X_test, y_test = feature_view.train_test_split(0.2)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 206
- },
- "id": "RIidxsXb1hG3",
- "outputId": "8d04d864-2be9-47d4-eae1-00e754b9897f"
- },
- "outputs": [],
- "source": [
- "X_train.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 206
- },
- "id": "LQ0F4B5s1lJh",
- "outputId": "5771938f-1031-43c9-9b97-66cf0d3e6433"
- },
- "outputs": [],
- "source": [
- "y_train.head(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "cpMXAUarmUEX"
- },
- "source": [
- "Let's check the distribution of our target label."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "7yOvZZZNmW91",
- "outputId": "4a20f728-708f-468e-eeae-39b8beb759bd"
- },
- "outputs": [],
- "source": [
- "y_train.value_counts(normalize=True)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "uXUWJXAzmaV7"
- },
- "source": [
- "Notice that the distribution is extremely skewed, which is natural considering that fraudulent transactions make up a tiny part of all transactions. Thus you should somehow address the class imbalance. There are many approaches for this, such as weighting the loss function, over- or undersampling, creating synthetic data, or modifying the decision threshold. In this example, you'll use the simplest method which is to just supply a class weight parameter to our learning algorithm. The class weight will affect how much importance is attached to each class, which in your case means that higher importance will be placed on positive (fraudulent) samples."
- ]
- },
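 -  {
 -   "cell_type": "markdown",
 -   "metadata": {},
 -   "source": [
 -    "For instance, here is a sketch of deriving balanced class weights from the training labels with scikit-learn's `compute_class_weight` (a hypothetical alternative to the fixed weight used below):\n",
 -    "\n",
 -    "    from sklearn.utils.class_weight import compute_class_weight\n",
 -    "    import numpy as np\n",
 -    "\n",
 -    "    # Weight each class by its inverse frequency in the train set.\n",
 -    "    labels = y_train.values.ravel()\n",
 -    "    classes = np.unique(labels)\n",
 -    "    weights = compute_class_weight(class_weight=\"balanced\", classes=classes, y=labels)\n",
 -    "    class_weight = dict(zip(classes, weights))\n",
 -    "\n",
 -    "Passing this dictionary as `LogisticRegression(class_weight=class_weight)` attaches higher importance to the rare positive class."
 -   ]
 -  },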
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "LxZfFvUDmdao"
- },
- "source": [
- "## 🧬 Train Model\n",
- "\n",
- "Next you'll train a model. Here, you set the class weight of the positive class to be twice as big as the negative class.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "2tXdVK91md5n",
- "outputId": "049e125b-a49a-45df-b69e-f24226c3faeb"
- },
- "outputs": [],
- "source": [
- "from sklearn.linear_model import LogisticRegression\n",
- "\n",
- "pos_class_weight = 0.9\n",
- "clf = LogisticRegression(class_weight={0: 1.0 - pos_class_weight, 1: pos_class_weight}, solver='liblinear')\n",
- "clf.fit(X_train, y_train.values.ravel())"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "CWQlec77mkcZ"
- },
- "source": [
- "Let's see how well it performs on your test data."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "qQK-dCZemmpe",
- "outputId": "e054c772-d590-4fe7-992c-fa4db72cbf00"
- },
- "outputs": [],
- "source": [
- "from sklearn.metrics import classification_report\n",
- "import pprint\n",
- "\n",
- "preds = clf.predict(X_test)\n",
- "\n",
- "report_dict = classification_report(y_test, preds, output_dict=True)\n",
- "pprint.pprint(report_dict, width=2)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "Mw20d2ehr5xt"
- },
- "source": [
- "Pickle the model and write it to a local folder."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "iLZExWW5oFbG",
- "outputId": "e3b27efe-52ab-48d0-8177-7a328c564900"
- },
- "outputs": [],
- "source": [
- "import joblib\n",
- "\n",
- "joblib.dump(clf, 'model.pkl')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "mr = project.get_model_registry()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "hvv1js3KczHh"
- },
- "source": [
- "## 📝 Export Model\n",
- "\n",
- "Next you will export the model and attach additional information like the signature of the inputs/predictions, a concrete input example and evaluation metrics. In the last line, you upload the pickled model file to the Model Registry."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "xjQ5JlMqsWdv",
- "outputId": "46cd5673-0100-4bf7-a820-5b099286a28d"
- },
- "outputs": [],
- "source": [
- "from hsml.schema import Schema\n",
- "from hsml.model_schema import ModelSchema\n",
- "\n",
- "input_schema = Schema(X_test)\n",
- "output_schema = Schema(y_test)\n",
- "\n",
- "fraud_model = mr.sklearn.create_model(\"fraud2\",\n",
- " metrics={'accuracy': report_dict['accuracy']},\n",
- " input_example=X_test.head(1), \n",
- " model_schema=ModelSchema(input_schema=input_schema, output_schema=output_schema))\n",
- "\n",
- "fraud_model.save('model.pkl')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "-dX28wxChHQX"
- },
- "source": [
- "## 🤖 Create Deployment\n",
- "\n",
- "Next step is to deploy the model on KServe behind Hopsworks for real-time inference requests."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 84,
- "referenced_widgets": [
- "43c0b243b2c9417abc5072f82ba22457",
- "4e326043046247f5ae5bfac3306cb145",
- "a16b8eae9f614a44a9ee7f7ea900d1f2",
- "a0e314c90c5a413d9701a6efcded884b",
- "24536f939afa41f1acff4e55fae4423c",
- "a26b3a1aa96f4eeaaad89d4d662fa010",
- "ed9e0fc80c9e483cb7aafad007b57ea0",
- "7887571c2ea4415080f60ea18be441b1",
- "4559fa2eaf7348ae9dd5d1cfdd3e0bd5",
- "7529396c14464b7b9e349c6ba003cddc",
- "71ad3ff88d62426abda44fdb300b0484"
- ]
- },
- "id": "MHq7yu99sG3x",
- "outputId": "119cbcf2-bf34-49c0-841b-d27c7cfc1e83"
- },
- "outputs": [],
- "source": [
- "deployment = fraud_model.deploy()\n",
- "deployment.start()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You can retrieve a reference to the model with mr.get_model(..) and a reference to the deployed model with `project.get_model_deployment(..)`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "ms = project.get_model_serving()\n",
- "deployment = ms.get_deployment(\"fraud2\")\n",
- "fraud_model = mr.get_model(\"fraud2\", version=1)\n",
- "print(fraud_model.input_example)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "QKCTKfcaimxo"
- },
- "source": [
- "## 🚀 Send Inference Requests\n",
- "\n",
- "Finally you can start making predictions with your model! \n",
- "\n",
- "Send inference requests to the deployed model as follows:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 52
- },
- "id": "aL3-2W39tC-u",
- "outputId": "fbda67a5-ce89-49ff-f113-a7dc8bbc2b6d"
- },
- "outputs": [],
- "source": [
- "deployment.predict({\"instances\":[fraud_model.input_example]})"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "HmwNUVbHjO5I"
- },
- "source": [
- "## 🥳 Next Steps\n",
- "\n",
- "Congratulations you've now completed the quickstart example for Managed Hopsworks.\n",
- "\n",
- "\n",
- "Check out our other tutorials on ➡ https://github.com/logicalclocks/hopsworks-tutorials\n",
- "\n",
- "Or documentation at ➡ https://docs.hopsworks.ai"
- ]
 -  }
- ],
- "metadata": {
- "colab": {
- "collapsed_sections": [],
- "name": "quickstart.ipynb",
- "provenance": []
- },
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.9"
- },
- "widgets": {
- "application/vnd.jupyter.widget-state+json": {
- "192f266700894002b24eeff9b2136db3": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "1b7c2a4d212646239113f831eeafc1cd": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "24536f939afa41f1acff4e55fae4423c": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "3f7e59807b824e86b47319bd47e32650": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HBoxModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HBoxModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HBoxView",
- "box_style": "",
- "children": [
- "IPY_MODEL_66c0d3f6020845d7a4203f33abf33818",
- "IPY_MODEL_f22754bd6225452f8253ecff644c31d5",
- "IPY_MODEL_77e516d3f3ce4c5488bf68825d260061"
- ],
- "layout": "IPY_MODEL_f91eeaa7449541819c9970f42963e2dd"
- }
- },
- "43c0b243b2c9417abc5072f82ba22457": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HBoxModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HBoxModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HBoxView",
- "box_style": "",
- "children": [
- "IPY_MODEL_4e326043046247f5ae5bfac3306cb145",
- "IPY_MODEL_a16b8eae9f614a44a9ee7f7ea900d1f2",
- "IPY_MODEL_a0e314c90c5a413d9701a6efcded884b"
- ],
- "layout": "IPY_MODEL_24536f939afa41f1acff4e55fae4423c"
- }
- },
- "4559fa2eaf7348ae9dd5d1cfdd3e0bd5": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "ProgressStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "bar_color": null,
- "description_width": ""
- }
- },
- "4e326043046247f5ae5bfac3306cb145": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_a26b3a1aa96f4eeaaad89d4d662fa010",
- "placeholder": "",
- "style": "IPY_MODEL_ed9e0fc80c9e483cb7aafad007b57ea0",
- "value": "Deployment is running: 100%"
- }
- },
- "5fd0bd75549a47ae8a3042f1e0c61ba5": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "66c0d3f6020845d7a4203f33abf33818": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_d1c010a576d94f8bbcc6f535741f514b",
- "placeholder": "",
- "style": "IPY_MODEL_5fd0bd75549a47ae8a3042f1e0c61ba5",
- "value": "Model export complete: 100%"
- }
- },
- "71ad3ff88d62426abda44fdb300b0484": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "7529396c14464b7b9e349c6ba003cddc": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "77e516d3f3ce4c5488bf68825d260061": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_192f266700894002b24eeff9b2136db3",
- "placeholder": "",
- "style": "IPY_MODEL_1b7c2a4d212646239113f831eeafc1cd",
- "value": " 6/6 [00:25<00:00, 5.19s/it]"
- }
- },
- "7887571c2ea4415080f60ea18be441b1": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "a0e314c90c5a413d9701a6efcded884b": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_7529396c14464b7b9e349c6ba003cddc",
- "placeholder": "",
- "style": "IPY_MODEL_71ad3ff88d62426abda44fdb300b0484",
- "value": " 1/1 [00:20<00:00, 5.12s/it]"
- }
- },
- "a16b8eae9f614a44a9ee7f7ea900d1f2": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "ProgressView",
- "bar_style": "success",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_7887571c2ea4415080f60ea18be441b1",
- "max": 1,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_4559fa2eaf7348ae9dd5d1cfdd3e0bd5",
- "value": 1
- }
- },
- "a26b3a1aa96f4eeaaad89d4d662fa010": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "d1c010a576d94f8bbcc6f535741f514b": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "d6462acb3ac6479f942e1a1b8da309a8": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "ProgressStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "bar_color": null,
- "description_width": ""
- }
- },
- "ed9e0fc80c9e483cb7aafad007b57ea0": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "f22754bd6225452f8253ecff644c31d5": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "ProgressView",
- "bar_style": "success",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_f2ccbe863c224765bbe755fcb0ba4be7",
- "max": 6,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_d6462acb3ac6479f942e1a1b8da309a8",
- "value": 6
- }
- },
- "f2ccbe863c224765bbe755fcb0ba4be7": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "f91eeaa7449541819c9970f42963e2dd": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- }
- }
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
\ No newline at end of file
diff --git a/docs/tutorials/fraud_batch/1_feature_groups.ipynb b/docs/tutorials/fraud_batch/1_feature_groups.ipynb
deleted file mode 100755
index 910fe1a9b..000000000
--- a/docs/tutorials/fraud_batch/1_feature_groups.ipynb
+++ /dev/null
@@ -1,935 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "eb87e56e",
- "metadata": {},
- "source": [
- "# Part 01: Load, Engineer & Connect\n",
- "\n",
- "[](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/fraud_batch/1_feature_groups.ipynb)\n",
- "\n",
- "This is the first part of the quick start series of tutorials about Hopsworks Feature Store. As part of this first module, we will work with data related to credit card transactions. \n",
- "The objective of this tutorial is to demonstrate how to work with the **Hopworks Feature Store** for batch data with a goal of training and deploying a model that can predict fraudulent transactions.\n",
- "\n",
- "\n",
- "## 🗒️ This notebook is divided in 3 sections:\n",
- "1. Loading the data and feature engineeing,\n",
- "2. Connect to the Hopsworks feature store,\n",
- "3. Create feature groups and upload them to the feature store.\n",
- "\n",
- "\n",
- "\n",
- "First of all we will load the data and do some feature engineering on it."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "cf0c2b64",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001B[33mWARNING: You are using pip version 22.0.3; however, version 22.1.2 is available.\n",
- "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001B[0m\u001B[33m\n",
- "\u001B[0m"
- ]
- }
- ],
- "source": [
- "!pip install -U hopsworks==3.0.0rc5 --quiet"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "fadfb659",
- "metadata": {},
- "source": [
- "## 💽 Loading the Data \n",
- "\n",
- "The data we will use comes from three different CSV files:\n",
- "\n",
- "- `credit_cards.csv`: credit card information such as expiration date and provider.\n",
- "- `transactions.csv`: transaction information such as timestamp, location, and the amount. Importantly, the binary `fraud_label` variable tells us whether a transaction was fraudulent or not.\n",
- "- `profiles.csv`: credit card user information such as birthdate and city of residence.\n",
- "\n",
- "We can conceptualize these CSV files as originating from separate data sources.\n",
- "**All three files have a credit card number column `cc_num` in common, which we can use for joins.**\n",
- "\n",
- "Let's go ahead and load the data."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "68ce9ada",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " cc_num | \n",
- " provider | \n",
- " expires | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 4796807885357879 | \n",
- " visa | \n",
- " 05/23 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 4529266636192966 | \n",
- " visa | \n",
- " 03/22 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 4922690008243953 | \n",
- " visa | \n",
- " 02/27 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " cc_num provider expires\n",
- "0 4796807885357879 visa 05/23\n",
- "1 4529266636192966 visa 03/22\n",
- "2 4922690008243953 visa 02/27"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import pandas as pd\n",
- "credit_cards_df = pd.read_csv(\"https://repo.hops.works/master/hopsworks-tutorials/data/card_fraud_data/credit_cards.csv\")\n",
- "credit_cards_df.head(3)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "0e78d1f6",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " name | \n",
- " sex | \n",
- " mail | \n",
- " birthdate | \n",
- " City | \n",
- " Country | \n",
- " cc_num | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Catherine Zimmerman | \n",
- " F | \n",
- " valenciajason@hotmail.com | \n",
- " 1988-09-20 | \n",
- " Bryn Mawr-Skyway | \n",
- " US | \n",
- " 4796807885357879 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Michael Williams | \n",
- " M | \n",
- " brettkennedy@yahoo.com | \n",
- " 1977-03-01 | \n",
- " Gates-North Gates | \n",
- " US | \n",
- " 4529266636192966 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Jessica Krueger | \n",
- " F | \n",
- " marthacruz@hotmail.com | \n",
- " 1947-09-10 | \n",
- " Greenfield | \n",
- " US | \n",
- " 4922690008243953 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " name sex mail birthdate \\\n",
- "0 Catherine Zimmerman F valenciajason@hotmail.com 1988-09-20 \n",
- "1 Michael Williams M brettkennedy@yahoo.com 1977-03-01 \n",
- "2 Jessica Krueger F marthacruz@hotmail.com 1947-09-10 \n",
- "\n",
- " City Country cc_num \n",
- "0 Bryn Mawr-Skyway US 4796807885357879 \n",
- "1 Gates-North Gates US 4529266636192966 \n",
- "2 Greenfield US 4922690008243953 "
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "profiles_df = pd.read_csv(\"https://repo.hops.works/master/hopsworks-tutorials/data/card_fraud_data/profiles.csv\", parse_dates=[\"birthdate\"])\n",
- "profiles_df.head(3)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "b5774d94",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " tid | \n",
- " datetime | \n",
- " cc_num | \n",
- " category | \n",
- " amount | \n",
- " latitude | \n",
- " longitude | \n",
- " city | \n",
- " country | \n",
- " fraud_label | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 11df919988c134d97bbff2678eb68e22 | \n",
- " 2022-01-01 00:00:24 | \n",
- " 4473593503484549 | \n",
- " Health/Beauty | \n",
- " 62.95 | \n",
- " 42.30865 | \n",
- " -83.48216 | \n",
- " Canton | \n",
- " US | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " dd0b2d6d4266ccd3bf05bc2ea91cf180 | \n",
- " 2022-01-01 00:00:56 | \n",
- " 4272465718946864 | \n",
- " Grocery | \n",
- " 85.45 | \n",
- " 33.52253 | \n",
- " -117.70755 | \n",
- " Laguna Niguel | \n",
- " US | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " e627f5d9a9739833bd52d2da51761fc3 | \n",
- " 2022-01-01 00:02:32 | \n",
- " 4104216579248948 | \n",
- " Domestic Transport | \n",
- " 21.63 | \n",
- " 37.60876 | \n",
- " -77.37331 | \n",
- " Mechanicsville | \n",
- " US | \n",
- " 0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " tid datetime cc_num \\\n",
- "0 11df919988c134d97bbff2678eb68e22 2022-01-01 00:00:24 4473593503484549 \n",
- "1 dd0b2d6d4266ccd3bf05bc2ea91cf180 2022-01-01 00:00:56 4272465718946864 \n",
- "2 e627f5d9a9739833bd52d2da51761fc3 2022-01-01 00:02:32 4104216579248948 \n",
- "\n",
- " category amount latitude longitude city country \\\n",
- "0 Health/Beauty 62.95 42.30865 -83.48216 Canton US \n",
- "1 Grocery 85.45 33.52253 -117.70755 Laguna Niguel US \n",
- "2 Domestic Transport 21.63 37.60876 -77.37331 Mechanicsville US \n",
- "\n",
- " fraud_label \n",
- "0 0 \n",
- "1 0 \n",
- "2 0 "
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "trans_df = pd.read_csv(\"https://repo.hops.works/master/hopsworks-tutorials/data/card_fraud_data/transactions.csv\", parse_dates=[\"datetime\"])\n",
- "trans_df.head(3)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "44ddf58f",
- "metadata": {},
- "source": [
- "## 🛠️ Feature Engineering \n",
- "\n",
- "Fraudulent transactions can differ from regular ones in many different ways. Typical red flags would for instance be a large transaction volume/frequency in the span of a few hours. It could also be the case that elderly people in particular are targeted by fraudsters. To facilitate model learning we will create additional features based on these patterns. In particular, we will create two types of features:\n",
- "1. **Features that aggregate data from different data sources**. This could for instance be the age of a customer at the time of a transaction, which combines the `birthdate` feature from `profiles.csv` with the `datetime` feature from `transactions.csv`.\n",
- "2. **Features that aggregate data from multiple time steps**. An example of this could be the transaction frequency of a credit card in the span of a few hours, which is computed using a window function.\n",
- "\n",
- "Let's start with the first category."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "6461d774",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " age_at_transaction | \n",
- " days_until_card_expires | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 97.513297 | \n",
- " 1460.999722 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 33.752919 | \n",
- " 1733.999352 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 80.899681 | \n",
- " 242.998241 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 53.526088 | \n",
- " 150.997639 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 46.005059 | \n",
- " 515.997280 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " age_at_transaction days_until_card_expires\n",
- "0 97.513297 1460.999722\n",
- "1 33.752919 1733.999352\n",
- "2 80.899681 242.998241\n",
- "3 53.526088 150.997639\n",
- "4 46.005059 515.997280"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import numpy as np\n",
- "\n",
- "# Compute age at transaction.\n",
- "age_df = trans_df.merge(profiles_df, on=\"cc_num\", how=\"left\")\n",
- "trans_df[\"age_at_transaction\"] = (age_df[\"datetime\"] - age_df[\"birthdate\"]) / np.timedelta64(1, \"Y\")\n",
- "\n",
- "# Compute days until card expires.\n",
- "card_expiry_df = trans_df.merge(credit_cards_df, on=\"cc_num\", how=\"left\")\n",
- "card_expiry_df[\"expires\"] = pd.to_datetime(card_expiry_df[\"expires\"], format=\"%m/%y\")\n",
- "trans_df[\"days_until_card_expires\"] = (card_expiry_df[\"expires\"] - card_expiry_df[\"datetime\"]) / np.timedelta64(1, \"D\")\n",
- "\n",
- "trans_df[[\"age_at_transaction\", \"days_until_card_expires\"]].head()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e4fbbd63",
- "metadata": {},
- "source": [
- "Next, we create features that for each credit card aggregate data from multiple time steps.\n",
- "\n",
- "We start by computing the distance between consecutive transactions, which we will call `loc_delta`.\n",
- "Here we use the [Haversine distance](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.haversine_distances.html?highlight=haversine#sklearn.metrics.pairwise.haversine_distances) to quantify the distance between two longitude and latitude coordinates."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "id": "03660624",
- "metadata": {},
- "outputs": [],
- "source": [
- "from math import radians\n",
- "\n",
- "# Do some simple preprocessing.\n",
- "trans_df.sort_values(\"datetime\", inplace=True)\n",
- "trans_df[[\"longitude\", \"latitude\"]] = trans_df[[\"longitude\", \"latitude\"]].applymap(radians)\n",
- "\n",
- "def haversine(long, lat):\n",
- " \"\"\"Compute Haversine distance between each consecutive coordinate in (long, lat).\"\"\"\n",
- "\n",
- " long_shifted = long.shift()\n",
- " lat_shifted = lat.shift()\n",
- " long_diff = long_shifted - long\n",
- " lat_diff = lat_shifted - lat\n",
- "\n",
- " a = np.sin(lat_diff/2.0)**2\n",
- " b = np.cos(lat) * np.cos(lat_shifted) * np.sin(long_diff/2.0)**2\n",
- " c = 2*np.arcsin(np.sqrt(a + b))\n",
- "\n",
- " return c\n",
- "\n",
- "\n",
- "trans_df[\"loc_delta\"] = trans_df.groupby(\"cc_num\")\\\n",
- " .apply(lambda x : haversine(x[\"longitude\"], x[\"latitude\"]))\\\n",
- " .reset_index(level=0, drop=True)\\\n",
- " .fillna(0)"
- ]
- },
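- {
- "cell_type": "markdown",
- "id": "haversine-check-md",
- "metadata": {},
- "source": [
- "As a quick sanity check (a small addition to the tutorial, assuming scikit-learn is installed), we can compare our implementation against scikit-learn's `haversine_distances` on the first two rows. Both return central angles in radians, so the two values should match."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "haversine-check-code",
- "metadata": {},
- "outputs": [],
- "source": [
- "from sklearn.metrics.pairwise import haversine_distances\n",
- "\n",
- "# scikit-learn expects (lat, long) pairs in radians; ours were converted above.\n",
- "pts = trans_df[[\"latitude\", \"longitude\"]].head(2).to_numpy()\n",
- "sk_angle = haversine_distances(pts)[0, 1]\n",
- "our_angle = haversine(trans_df[\"longitude\"].head(2), trans_df[\"latitude\"].head(2)).iloc[1]\n",
- "print(sk_angle, our_angle)  # the two values should match"
- ]
- },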
- {
- "cell_type": "markdown",
- "id": "62121667",
- "metadata": {},
- "source": [
- "Next we compute windowed aggregates. Here we will use 4-hour windows, but feel free to experiment with different window lengths by setting `window_len` below to a value of your choice."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "id": "aab205e9",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " trans_volume_mstd | \n",
- " trans_volume_mavg | \n",
- " trans_freq | \n",
- " loc_delta_mavg | \n",
- " cc_num | \n",
- " datetime | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 106015 | \n",
- " 73.08 | \n",
- " 73.08 | \n",
- " 73.08 | \n",
- " 0.045635 | \n",
- " 4032019521897961 | \n",
- " 2022-03-24 10:57:02 | \n",
- "
\n",
- " \n",
- " 106016 | \n",
- " 287.33 | \n",
- " 287.33 | \n",
- " 287.33 | \n",
- " 0.045846 | \n",
- " 4032019521897961 | \n",
- " 2022-03-28 11:57:02 | \n",
- "
\n",
- " \n",
- " 106017 | \n",
- " 53.88 | \n",
- " 53.88 | \n",
- " 53.88 | \n",
- " 0.000120 | \n",
- " 4032019521897961 | \n",
- " 2022-04-01 12:57:02 | \n",
- "
\n",
- " \n",
- " 106018 | \n",
- " 279.73 | \n",
- " 279.73 | \n",
- " 279.73 | \n",
- " 0.045928 | \n",
- " 4032019521897961 | \n",
- " 2022-04-05 13:57:02 | \n",
- "
\n",
- " \n",
- " 106019 | \n",
- " 73.66 | \n",
- " 73.66 | \n",
- " 73.66 | \n",
- " 0.045974 | \n",
- " 4032019521897961 | \n",
- " 2022-04-09 14:57:02 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " trans_volume_mstd trans_volume_mavg trans_freq loc_delta_mavg \\\n",
- "106015 73.08 73.08 73.08 0.045635 \n",
- "106016 287.33 287.33 287.33 0.045846 \n",
- "106017 53.88 53.88 53.88 0.000120 \n",
- "106018 279.73 279.73 279.73 0.045928 \n",
- "106019 73.66 73.66 73.66 0.045974 \n",
- "\n",
- " cc_num datetime \n",
- "106015 4032019521897961 2022-03-24 10:57:02 \n",
- "106016 4032019521897961 2022-03-28 11:57:02 \n",
- "106017 4032019521897961 2022-04-01 12:57:02 \n",
- "106018 4032019521897961 2022-04-05 13:57:02 \n",
- "106019 4032019521897961 2022-04-09 14:57:02 "
- ]
- },
- "execution_count": 20,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "window_len = \"4h\"\n",
- "cc_group = trans_df[[\"cc_num\", \"amount\", \"datetime\"]].groupby(\"cc_num\").rolling(window_len, on=\"datetime\")\n",
- "\n",
- "# Moving average of transaction volume.\n",
- "df_4h_mavg = pd.DataFrame(cc_group.mean())\n",
- "df_4h_mavg.columns = [\"trans_volume_mavg\", \"datetime\"]\n",
- "df_4h_mavg = df_4h_mavg.reset_index(level=[\"cc_num\"])\n",
- "df_4h_mavg = df_4h_mavg.drop(columns=[\"cc_num\", \"datetime\"])\n",
- "df_4h_mavg = df_4h_mavg.sort_index()\n",
- "\n",
- "# Moving standard deviation of transaction volume.\n",
- "df_4h_std = pd.DataFrame(cc_group.mean())\n",
- "df_4h_std.columns = [\"trans_volume_mstd\", \"datetime\"]\n",
- "df_4h_std = df_4h_std.reset_index(level=[\"cc_num\"])\n",
- "df_4h_std = df_4h_std.drop(columns=[\"cc_num\", \"datetime\"])\n",
- "df_4h_std = df_4h_std.fillna(0)\n",
- "df_4h_std = df_4h_std.sort_index()\n",
- "window_aggs_df = df_4h_std.merge(df_4h_mavg,left_index=True, right_index=True)\n",
- "\n",
- "# Moving average of transaction frequency.\n",
- "df_4h_count = pd.DataFrame(cc_group.mean())\n",
- "df_4h_count.columns = [\"trans_freq\", \"datetime\"]\n",
- "df_4h_count = df_4h_count.reset_index(level=[\"cc_num\"])\n",
- "df_4h_count = df_4h_count.drop(columns=[\"cc_num\", \"datetime\"])\n",
- "df_4h_count = df_4h_count.sort_index()\n",
- "window_aggs_df = window_aggs_df.merge(df_4h_count,left_index=True, right_index=True)\n",
- "\n",
- "# Moving average of location difference between consecutive transactions.\n",
- "cc_group = trans_df[[\"cc_num\", \"loc_delta\", \"datetime\"]].groupby(\"cc_num\").rolling(window_len, on=\"datetime\").mean()\n",
- "df_4h_loc_delta_mavg = pd.DataFrame(cc_group)\n",
- "df_4h_loc_delta_mavg.columns = [\"loc_delta_mavg\", \"datetime\"]\n",
- "df_4h_loc_delta_mavg = df_4h_loc_delta_mavg.reset_index(level=[\"cc_num\"])\n",
- "df_4h_loc_delta_mavg = df_4h_loc_delta_mavg.drop(columns=[\"cc_num\", \"datetime\"])\n",
- "df_4h_loc_delta_mavg = df_4h_loc_delta_mavg.sort_index()\n",
- "window_aggs_df = window_aggs_df.merge(df_4h_loc_delta_mavg,left_index=True, right_index=True)\n",
- "\n",
- "window_aggs_df = window_aggs_df.merge(trans_df[[\"cc_num\", \"datetime\"]].sort_index(),left_index=True, right_index=True)\n",
- "window_aggs_df.tail()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "2e25f1ea",
- "metadata": {},
- "source": [
- "### Convert date time object to unix epoch in milliseconds"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "ab0048f5",
- "metadata": {},
- "outputs": [],
- "source": [
- "trans_df.datetime = trans_df.datetime.values.astype(np.int64) // 10 ** 6\n",
- "window_aggs_df.datetime = window_aggs_df.datetime.values.astype(np.int64) // 10 ** 6"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8d81124a",
- "metadata": {},
- "source": [
- "## 🪄 Creating Feature Groups \n",
- "\n",
- "A [feature group](https://docs.hopsworks.ai/feature-store-api/latest/generated/feature_group/) can be seen as a collection of conceptually related features. In our case, we will create a feature group for the transaction data and a feature group for the windowed aggregations on the transaction data. Both will have `cc_num` as primary key, which will allow us to join them when creating a dataset in the next tutorial.\n",
- "\n",
- "Feature groups can also be used to define a namespace for features. For instance, in a real-life setting we would likely want to experiment with different window lengths. In that case, we can create feature groups with identical schema for each window length. \n",
- "\n",
- "Before we can create a feature group we need to connect to our feature store."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "472dc254",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Connected. Call `.close()` to terminate connection gracefully.\n"
- ]
- }
- ],
- "source": [
- "import hopsworks\n",
- "\n",
- "project = hopsworks.login()\n",
- "\n",
- "fs = project.get_feature_store()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "80ca38a4",
- "metadata": {},
- "source": [
- "To create a feature group we need to give it a name and specify a primary key. It is also good to provide a description of the contents of the feature group and a version number, if it is not defined it will automatically be incremented to `1`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "ce9ea36d",
- "metadata": {},
- "outputs": [],
- "source": [
- "trans_fg = fs.get_or_create_feature_group(\n",
- " name=\"transactions\",\n",
- " version=1,\n",
- " description=\"Transaction data\",\n",
- " primary_key=['cc_num'],\n",
- " event_time=['datetime']\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "450f94e9",
- "metadata": {},
- "source": [
- "A full list of arguments can be found in the [documentation](https://docs.hopsworks.ai/feature-store-api/latest/generated/api/feature_store_api/#create_feature_group).\n",
- "\n",
- "At this point, we have only specified some metadata for the feature group. It does not store any data or even have a schema defined for the data. To make the feature group persistent we populate it with its associated data using the `insert` function."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "c23ade7b",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Feature Group created successfully, explore it at https://c.app.hopsworks.ai:443/p/124/fs/72/fg/72\n",
- "Launching offline feature group backfill job...\n",
- "Backfill Job started successfully, you can follow the progress at https://c.app.hopsworks.ai/p/124/jobs/named/transactions_1_offline_fg_backfill/executions\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "(, None)"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "trans_fg.insert(trans_df)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "2e00a3f4-7749-46af-bd01-78e5126f12ae",
- "metadata": {},
- "outputs": [],
- "source": [
- "feature_descriptions = [\n",
- " {\"name\": \"tid\", \"description\": \"Transaction id\"},\n",
- " {\"name\": \"datetime\", \"description\": \"Transaction time\"},\n",
- " {\"name\": \"cc_num\", \"description\": \"Number of the credit card performing the transaction\"},\n",
- " {\"name\": \"category\", \"description\": \"Expense category\"},\n",
- " {\"name\": \"amount\", \"description\": \"Dollar amount of the transaction\"},\n",
- " {\"name\": \"latitude\", \"description\": \"Transaction location latitude\"},\n",
- " {\"name\": \"longitude\", \"description\": \"Transaction location longitude\"},\n",
- " {\"name\": \"city\", \"description\": \"City in which the transaction was made\"},\n",
- " {\"name\": \"country\", \"description\": \"Country in which the transaction was made\"},\n",
- " {\"name\": \"fraud_label\", \"description\": \"Whether the transaction was fraudulent or not\"},\n",
- " {\"name\": \"age_at_transaction\", \"description\": \"Age of the card holder when the transaction was made\"},\n",
- " {\"name\": \"days_until_card_expires\", \"description\": \"Card validity days left when the transaction was made\"},\n",
- " {\"name\": \"loc_delta\", \"description\": \"Haversine distance between this transaction location and the previous transaction location from the same card\"},\n",
- "]\n",
- "\n",
- "for desc in feature_descriptions: \n",
- " trans_fg.update_feature_description(desc[\"name\"], desc[\"description\"])"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "c0aed1e7",
- "metadata": {},
- "source": [
- "At the creation of the feature group, you will prompted with an URL that will directly link to it; there you will be able to explore some of the aspects of your newly created feature group.\n",
- "\n",
- "[//]: <> (insert GIF here)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f7bdca21",
- "metadata": {},
- "source": [
- "We can move on and do the same thing for the feature group with our windows aggregation."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "id": "7b11a34d",
- "metadata": {},
- "outputs": [],
- "source": [
- "window_aggs_fg = fs.get_or_create_feature_group(\n",
- " name=f\"transactions_{window_len}_aggs\",\n",
- " version=1,\n",
- " description=f\"Aggregate transaction data over {window_len} windows.\",\n",
- " primary_key=['cc_num'],\n",
- " event_time=['datetime']\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "9276c51c",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Feature Group created successfully, explore it at https://c.app.hopsworks.ai:443/p/124/fs/72/fg/73\n",
- "Launching offline feature group backfill job...\n",
- "Backfill Job started successfully, you can follow the progress at https://c.app.hopsworks.ai/p/124/jobs/named/transactions_4h_aggs_1_offline_fg_backfill/executions\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "(, None)"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "window_aggs_fg.insert(window_aggs_df)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "id": "532e7301-935d-4cdd-95ea-2c68a50c80d5",
- "metadata": {},
- "outputs": [],
- "source": [
- "feature_descriptions = [\n",
- " {\"name\": \"datetime\", \"description\": \"Transaction time\"},\n",
- " {\"name\": \"cc_num\", \"description\": \"Number of the credit card performing the transaction\"},\n",
- " {\"name\": \"loc_delta_mavg\", \"description\": \"Moving average of location difference between consecutive transactions from the same card\"},\n",
- " {\"name\": \"trans_freq\", \"description\": \"Moving average of transaction frequency from the same card\"},\n",
- " {\"name\": \"trans_volume_mavg\", \"description\": \"Moving average of transaction volume from the same card\"},\n",
- " {\"name\": \"trans_volume_mstd\", \"description\": \"Moving standard deviation of transaction volume from the same card\"},\n",
- "]\n",
- "\n",
- "for desc in feature_descriptions: \n",
- " window_aggs_fg.update_feature_description(desc[\"name\"], desc[\"description\"])"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "a704dcc9",
- "metadata": {},
- "source": [
- "Both feature groups are now accessible and searchable in the UI\n",
- "\n",
- ""
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ccdab7eb",
- "metadata": {},
- "source": [
- "## ⏭️ **Next:** Part 02 \n",
- "\n",
- "In the following notebook we will use our feature groups to create a dataset we can train a model on."
- ]
- }
- ],
- "metadata": {
- "interpreter": {
- "hash": "e1ddeae6eefc765c17da80d38ea59b893ab18c0c0904077a035ef84cfe367f83"
- },
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.9"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
\ No newline at end of file
diff --git a/docs/tutorials/fraud_batch/2_feature_view_creation.ipynb b/docs/tutorials/fraud_batch/2_feature_view_creation.ipynb
deleted file mode 100755
index e165bcee3..000000000
--- a/docs/tutorials/fraud_batch/2_feature_view_creation.ipynb
+++ /dev/null
@@ -1,879 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Part 02: Training Data & Feature views\n",
- "\n",
- "[](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/fraud_batch/2_feature_view_creation.ipynb)\n",
- "\n",
- "This is the second part of the quick start series of tutorials about Hopsworks Feature Store. This notebook explains how to read from a feature group and create training dataset within the feature store\n",
- "\n",
- "## 🗒️ In this notebook we will see how to create a training dataset from the feature groups:\n",
- "1. **Select the features** we want to train our model on,\n",
- "2. **How the features should be preprocessed,**\n",
- "3. **Create a dataset split** for training and validation data.\n",
- "\n",
- " "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Connected. Call `.close()` to terminate connection gracefully.\n",
- "\n",
- "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/124\n",
- "Connected. Call `.close()` to terminate connection gracefully.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses\n"
- ]
- }
- ],
- "source": [
- "import hopsworks\n",
- "\n",
- "project = hopsworks.login()\n",
- "\n",
- "fs = project.get_feature_store()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 🔪 Feature Selection \n",
- "\n",
- "We start by selecting all the features we want to include for model training/inference."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Load feature groups.\n",
- "trans_fg = fs.get_feature_group('transactions', version=1)\n",
- "window_aggs_fg = fs.get_feature_group('transactions_4h_aggs', version=1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "2022-06-20 09:38:17,044 INFO: USE `robin100_featurestore`\n",
- "2022-06-20 09:38:18,287 INFO: WITH right_fg0 AS (SELECT *\n",
- "FROM (SELECT `fg1`.`fraud_label` `fraud_label`, `fg1`.`category` `category`, `fg1`.`amount` `amount`, `fg1`.`age_at_transaction` `age_at_transaction`, `fg1`.`days_until_card_expires` `days_until_card_expires`, `fg1`.`loc_delta` `loc_delta`, `fg1`.`cc_num` `join_pk_cc_num`, `fg1`.`datetime` `join_evt_datetime`, `fg0`.`trans_volume_mstd` `trans_volume_mstd`, `fg0`.`trans_volume_mavg` `trans_volume_mavg`, `fg0`.`trans_freq` `trans_freq`, `fg0`.`loc_delta_mavg` `loc_delta_mavg`, RANK() OVER (PARTITION BY `fg0`.`cc_num`, `fg1`.`datetime` ORDER BY `fg0`.`datetime` DESC) pit_rank_hopsworks\n",
- "FROM `robin100_featurestore`.`transactions_1` `fg1`\n",
- "INNER JOIN `robin100_featurestore`.`transactions_4h_aggs_1` `fg0` ON `fg1`.`cc_num` = `fg0`.`cc_num` AND `fg1`.`datetime` >= `fg0`.`datetime`) NA\n",
- "WHERE `pit_rank_hopsworks` = 1) (SELECT `right_fg0`.`fraud_label` `fraud_label`, `right_fg0`.`category` `category`, `right_fg0`.`amount` `amount`, `right_fg0`.`age_at_transaction` `age_at_transaction`, `right_fg0`.`days_until_card_expires` `days_until_card_expires`, `right_fg0`.`loc_delta` `loc_delta`, `right_fg0`.`trans_volume_mstd` `trans_volume_mstd`, `right_fg0`.`trans_volume_mavg` `trans_volume_mavg`, `right_fg0`.`trans_freq` `trans_freq`, `right_fg0`.`loc_delta_mavg` `loc_delta_mavg`\n",
- "FROM right_fg0)\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "UserWarning: pandas only support SQLAlchemy connectable(engine/connection) ordatabase string URI or sqlite3 DBAPI2 connectionother DBAPI2 objects are not tested, please consider using SQLAlchemy\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " fraud_label | \n",
- " category | \n",
- " amount | \n",
- " age_at_transaction | \n",
- " days_until_card_expires | \n",
- " loc_delta | \n",
- " trans_volume_mstd | \n",
- " trans_volume_mavg | \n",
- " trans_freq | \n",
- " loc_delta_mavg | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 0 | \n",
- " Grocery | \n",
- " 93.51 | \n",
- " 25.334094 | \n",
- " 175.912280 | \n",
- " 0.000000 | \n",
- " 93.510 | \n",
- " 93.510 | \n",
- " 93.510 | \n",
- " 0.000000 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 0 | \n",
- " Domestic Transport | \n",
- " 65.14 | \n",
- " 25.335632 | \n",
- " 175.350486 | \n",
- " 0.319574 | \n",
- " 65.140 | \n",
- " 65.140 | \n",
- " 65.140 | \n",
- " 0.319574 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 0 | \n",
- " Grocery | \n",
- " 0.26 | \n",
- " 25.336235 | \n",
- " 175.130347 | \n",
- " 0.314148 | \n",
- " 0.260 | \n",
- " 0.260 | \n",
- " 0.260 | \n",
- " 0.314148 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 0 | \n",
- " Grocery | \n",
- " 1.43 | \n",
- " 25.336660 | \n",
- " 174.975058 | \n",
- " 0.000000 | \n",
- " 0.845 | \n",
- " 0.845 | \n",
- " 0.845 | \n",
- " 0.157074 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 0 | \n",
- " Grocery | \n",
- " 19.75 | \n",
- " 25.344710 | \n",
- " 172.034664 | \n",
- " 0.105313 | \n",
- " 19.750 | \n",
- " 19.750 | \n",
- " 19.750 | \n",
- " 0.105313 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " fraud_label category amount age_at_transaction \\\n",
- "0 0 Grocery 93.51 25.334094 \n",
- "1 0 Domestic Transport 65.14 25.335632 \n",
- "2 0 Grocery 0.26 25.336235 \n",
- "3 0 Grocery 1.43 25.336660 \n",
- "4 0 Grocery 19.75 25.344710 \n",
- "\n",
- " days_until_card_expires loc_delta trans_volume_mstd trans_volume_mavg \\\n",
- "0 175.912280 0.000000 93.510 93.510 \n",
- "1 175.350486 0.319574 65.140 65.140 \n",
- "2 175.130347 0.314148 0.260 0.260 \n",
- "3 174.975058 0.000000 0.845 0.845 \n",
- "4 172.034664 0.105313 19.750 19.750 \n",
- "\n",
- " trans_freq loc_delta_mavg \n",
- "0 93.510 0.000000 \n",
- "1 65.140 0.319574 \n",
- "2 0.260 0.314148 \n",
- "3 0.845 0.157074 \n",
- "4 19.750 0.105313 "
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Select features for training data.\n",
- "ds_query = trans_fg.select([\"fraud_label\", \"category\", \"amount\", \"age_at_transaction\", \"days_until_card_expires\", \"loc_delta\"])\\\n",
- " .join(window_aggs_fg.select_except([\"cc_num\"]))\n",
- "\n",
- "ds_query.show(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Recall that we computed the features in `transactions_4h_aggs` using 4-hour aggregates. If we had created multiple feature groups with identical schema for different window lengths, and wanted to include them in the join we would need to include a prefix argument in the join to avoid feature name clash. See the [documentation](https://docs.hopsworks.ai/feature-store-api/latest/generated/api/query_api/#join) for more details."
- ]
- },
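- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "For illustration only, such a join might look like the sketch below, where `window_aggs_1h_fg` is a hypothetical feature group with the same schema computed over 1-hour windows (it is not created in this tutorial):"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Hypothetical sketch: prefixes keep identically named features apart.\n",
- "# window_aggs_1h_fg is assumed to exist with the same schema as window_aggs_fg.\n",
- "multi_window_query = trans_fg.select([\"fraud_label\", \"category\", \"amount\"])\\\n",
- "    .join(window_aggs_fg.select_except([\"cc_num\"]), prefix=\"fourh_\")\\\n",
- "    .join(window_aggs_1h_fg.select_except([\"cc_num\"]), prefix=\"oneh_\")"
- ]
- },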
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "🤖 Transformation Functions \n",
- "\n",
- "We will preprocess our data using *min-max scaling* on numerical features and *label encoding* on categorical features. To do this we simply define a mapping between our features and transformation functions. This ensures that transformation functions such as *min-max scaling* are fitted only on the training data (and not the validation/test data), which ensures that there is no data leakage."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Load transformation functions.\n",
- "min_max_scaler = fs.get_transformation_function(name=\"min_max_scaler\")\n",
- "label_encoder = fs.get_transformation_function(name=\"label_encoder\")\n",
- "\n",
- "# Map features to transformations.\n",
- "transformation_functions = {\n",
- " \"category\": label_encoder,\n",
- " \"amount\": min_max_scaler,\n",
- " \"trans_volume_mavg\": min_max_scaler,\n",
- " \"trans_volume_mstd\": min_max_scaler,\n",
- " \"trans_freq\": min_max_scaler,\n",
- " \"loc_delta\": min_max_scaler,\n",
- " \"loc_delta_mavg\": min_max_scaler,\n",
- " \"age_at_transaction\": min_max_scaler,\n",
- " \"days_until_card_expires\": min_max_scaler,\n",
- "}"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## ⚙️ Feature View Creation \n",
- "\n",
- "The Feature Views allows schema in form of a query with filters, define a model target feature/label and additional transformation functions.\n",
- "In order to create a Feature View we may use `fs.create_feature_view()`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Feature view created successfully, explore it at https://c.app.hopsworks.ai:443/p/124/fs/72/fv/transactions_view/version/1\n"
- ]
- }
- ],
- "source": [
- "feature_view = fs.create_feature_view(\n",
- " name='transactions_view',\n",
- " query=ds_query,\n",
- " labels=[\"fraud_label\"],\n",
- " transformation_functions=transformation_functions\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "## 🏋️ Training Dataset Creation\n",
- "\n",
- "In Hopsworks training data is a query where the projection (set of features) is determined by the parent FeatureView with an optional snapshot on disk of the data returned by the query.\n",
- "\n",
- "**Training Dataset may contain splits such as:** \n",
- "* Training set - the subset of training data used to train a model.\n",
- "* Validation set - the subset of training data used to evaluate hparams when training a model\n",
- "* Test set - the holdout subset of training data used to evaluate a mode\n",
- "\n",
- "Training dataset is created using `fs.create_train_validation_test_split()` method."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Training dataset job started successfully, you can follow the progress at https://c.app.hopsworks.ai/p/124/jobs/named/transactions_view_1_1_create_fv_td_20062022073937/executions\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "VersionWarning: Incremented version to `1`.\n"
- ]
- }
- ],
- "source": [
- "td_version, td_job = feature_view.create_train_validation_test_split(\n",
- " description = 'transactions_dataset',\n",
- " data_format = 'csv',\n",
- " validation_size = 0.2,\n",
- " test_size = 0.1,\n",
- " write_options = {'wait_for_job': True},\n",
- " coalesce = True,\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 🪝 Training Dataset retreival \n",
- "\n",
- "To retrieve training data from storage (already materialised) or from feature groups direcly we can use `get_training_dataset` or `get_train_validation_test_split` methods. You need to provide the version of the training dataset you want to retrieve."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "X_train, y_train, X_val, y_val, X_test, y_test = feature_view.get_train_validation_test_split(td_version)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " category | \n",
- " amount | \n",
- " age_at_transaction | \n",
- " days_until_card_expires | \n",
- " loc_delta | \n",
- " trans_volume_mstd | \n",
- " trans_volume_mavg | \n",
- " trans_freq | \n",
- " loc_delta_mavg | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 0 | \n",
- " 0.000000 | \n",
- " 0.010858 | \n",
- " 0.850452 | \n",
- " 0.024955 | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " 0.027577 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 0 | \n",
- " 0.000000 | \n",
- " 0.047378 | \n",
- " 0.943722 | \n",
- " 0.035718 | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " 0.039471 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 0 | \n",
- " 0.000000 | \n",
- " 0.063759 | \n",
- " 0.132026 | \n",
- " 0.000044 | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " 0.000048 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 0 | \n",
- " 0.000000 | \n",
- " 0.340603 | \n",
- " 0.208466 | \n",
- " 0.211902 | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " 0.234169 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 0 | \n",
- " 0.000000 | \n",
- " 0.954661 | \n",
- " 0.874834 | \n",
- " 0.183911 | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " 0.203237 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 74206 | \n",
- " 5 | \n",
- " 0.005703 | \n",
- " 0.206613 | \n",
- " 0.228933 | \n",
- " 0.098071 | \n",
- " 0.002156 | \n",
- " 0.002156 | \n",
- " 0.002156 | \n",
- " 0.103897 | \n",
- "
\n",
- " \n",
- " 74207 | \n",
- " 5 | \n",
- " 0.007304 | \n",
- " 0.909331 | \n",
- " 0.728989 | \n",
- " 0.214688 | \n",
- " 0.004702 | \n",
- " 0.004702 | \n",
- " 0.004702 | \n",
- " 0.210192 | \n",
- "
\n",
- " \n",
- " 74208 | \n",
- " 5 | \n",
- " 0.013629 | \n",
- " 0.516288 | \n",
- " 0.314443 | \n",
- " 0.086154 | \n",
- " 0.005832 | \n",
- " 0.005832 | \n",
- " 0.005832 | \n",
- " 0.102090 | \n",
- "
\n",
- " \n",
- " 74209 | \n",
- " 8 | \n",
- " 0.000170 | \n",
- " 0.132164 | \n",
- " 0.816431 | \n",
- " 0.217638 | \n",
- " 0.000170 | \n",
- " 0.000170 | \n",
- " 0.000170 | \n",
- " 0.240507 | \n",
- "
\n",
- " \n",
- " 74210 | \n",
- " 8 | \n",
- " 0.000488 | \n",
- " 0.488983 | \n",
- " 0.607004 | \n",
- " 0.043470 | \n",
- " 0.000488 | \n",
- " 0.000488 | \n",
- " 0.000488 | \n",
- " 0.048038 | \n",
- "
\n",
- " \n",
- "
\n",
- "
74211 rows × 9 columns
\n",
- "
"
- ],
- "text/plain": [
- " category amount age_at_transaction days_until_card_expires \\\n",
- "0 0 0.000000 0.010858 0.850452 \n",
- "1 0 0.000000 0.047378 0.943722 \n",
- "2 0 0.000000 0.063759 0.132026 \n",
- "3 0 0.000000 0.340603 0.208466 \n",
- "4 0 0.000000 0.954661 0.874834 \n",
- "... ... ... ... ... \n",
- "74206 5 0.005703 0.206613 0.228933 \n",
- "74207 5 0.007304 0.909331 0.728989 \n",
- "74208 5 0.013629 0.516288 0.314443 \n",
- "74209 8 0.000170 0.132164 0.816431 \n",
- "74210 8 0.000488 0.488983 0.607004 \n",
- "\n",
- " loc_delta trans_volume_mstd trans_volume_mavg trans_freq \\\n",
- "0 0.024955 0.000000 0.000000 0.000000 \n",
- "1 0.035718 0.000000 0.000000 0.000000 \n",
- "2 0.000044 0.000000 0.000000 0.000000 \n",
- "3 0.211902 0.000000 0.000000 0.000000 \n",
- "4 0.183911 0.000000 0.000000 0.000000 \n",
- "... ... ... ... ... \n",
- "74206 0.098071 0.002156 0.002156 0.002156 \n",
- "74207 0.214688 0.004702 0.004702 0.004702 \n",
- "74208 0.086154 0.005832 0.005832 0.005832 \n",
- "74209 0.217638 0.000170 0.000170 0.000170 \n",
- "74210 0.043470 0.000488 0.000488 0.000488 \n",
- "\n",
- " loc_delta_mavg \n",
- "0 0.027577 \n",
- "1 0.039471 \n",
- "2 0.000048 \n",
- "3 0.234169 \n",
- "4 0.203237 \n",
- "... ... \n",
- "74206 0.103897 \n",
- "74207 0.210192 \n",
- "74208 0.102090 \n",
- "74209 0.240507 \n",
- "74210 0.048038 \n",
- "\n",
- "[74211 rows x 9 columns]"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "X_train"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " category | \n",
- " amount | \n",
- " age_at_transaction | \n",
- " days_until_card_expires | \n",
- " loc_delta | \n",
- " trans_volume_mstd | \n",
- " trans_volume_mavg | \n",
- " trans_freq | \n",
- " loc_delta_mavg | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 0 | \n",
- " 3.336858e-07 | \n",
- " 0.835281 | \n",
- " 0.735671 | \n",
- " 0.043515 | \n",
- " 1.052428e-02 | \n",
- " 1.052428e-02 | \n",
- " 1.052428e-02 | \n",
- " 0.088640 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 0 | \n",
- " 6.673716e-07 | \n",
- " 0.030266 | \n",
- " 0.437924 | \n",
- " 0.207004 | \n",
- " 6.673716e-07 | \n",
- " 6.673716e-07 | \n",
- " 6.673716e-07 | \n",
- " 0.228756 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 0 | \n",
- " 6.673716e-07 | \n",
- " 0.501513 | \n",
- " 0.770557 | \n",
- " 0.060860 | \n",
- " 6.673716e-07 | \n",
- " 6.673716e-07 | \n",
- " 6.673716e-07 | \n",
- " 0.067256 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 0 | \n",
- " 6.673716e-07 | \n",
- " 0.668691 | \n",
- " 0.309189 | \n",
- " 0.117757 | \n",
- " 6.673716e-07 | \n",
- " 6.673716e-07 | \n",
- " 6.673716e-07 | \n",
- " 0.130131 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 0 | \n",
- " 6.673716e-07 | \n",
- " 0.671936 | \n",
- " 0.773186 | \n",
- " 0.189745 | \n",
- " 6.673716e-07 | \n",
- " 6.673716e-07 | \n",
- " 6.673716e-07 | \n",
- " 0.209683 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 21152 | \n",
- " 4 | \n",
- " 1.403816e-03 | \n",
- " 0.909331 | \n",
- " 0.728987 | \n",
- " 0.045351 | \n",
- " 2.677114e-03 | \n",
- " 2.677114e-03 | \n",
- " 2.677114e-03 | \n",
- " 0.160770 | \n",
- "
\n",
- " \n",
- " 21153 | \n",
- " 4 | \n",
- " 1.718482e-03 | \n",
- " 0.476548 | \n",
- " 0.233572 | \n",
- " 0.084373 | \n",
- " 2.240900e-03 | \n",
- " 2.240900e-03 | \n",
- " 2.240900e-03 | \n",
- " 0.138877 | \n",
- "
\n",
- " \n",
- " 21154 | \n",
- " 4 | \n",
- " 2.564709e-03 | \n",
- " 0.560456 | \n",
- " 0.835585 | \n",
- " 0.147691 | \n",
- " 8.005345e-03 | \n",
- " 8.005345e-03 | \n",
- " 8.005345e-03 | \n",
- " 0.176319 | \n",
- "
\n",
- " \n",
- " 21155 | \n",
- " 4 | \n",
- " 3.218733e-03 | \n",
- " 0.448547 | \n",
- " 0.643785 | \n",
- " 0.040216 | \n",
- " 1.141806e-03 | \n",
- " 1.141806e-03 | \n",
- " 1.141806e-03 | \n",
- " 0.137911 | \n",
- "
\n",
- " \n",
- " 21156 | \n",
- " 8 | \n",
- " 3.098940e-03 | \n",
- " 0.266895 | \n",
- " 0.270363 | \n",
- " 0.097779 | \n",
- " 3.098940e-03 | \n",
- " 3.098940e-03 | \n",
- " 3.098940e-03 | \n",
- " 0.108053 | \n",
- "
\n",
- " \n",
- "
\n",
- "
21157 rows × 9 columns
\n",
- "
"
- ],
- "text/plain": [
- " category amount age_at_transaction days_until_card_expires \\\n",
- "0 0 3.336858e-07 0.835281 0.735671 \n",
- "1 0 6.673716e-07 0.030266 0.437924 \n",
- "2 0 6.673716e-07 0.501513 0.770557 \n",
- "3 0 6.673716e-07 0.668691 0.309189 \n",
- "4 0 6.673716e-07 0.671936 0.773186 \n",
- "... ... ... ... ... \n",
- "21152 4 1.403816e-03 0.909331 0.728987 \n",
- "21153 4 1.718482e-03 0.476548 0.233572 \n",
- "21154 4 2.564709e-03 0.560456 0.835585 \n",
- "21155 4 3.218733e-03 0.448547 0.643785 \n",
- "21156 8 3.098940e-03 0.266895 0.270363 \n",
- "\n",
- " loc_delta trans_volume_mstd trans_volume_mavg trans_freq \\\n",
- "0 0.043515 1.052428e-02 1.052428e-02 1.052428e-02 \n",
- "1 0.207004 6.673716e-07 6.673716e-07 6.673716e-07 \n",
- "2 0.060860 6.673716e-07 6.673716e-07 6.673716e-07 \n",
- "3 0.117757 6.673716e-07 6.673716e-07 6.673716e-07 \n",
- "4 0.189745 6.673716e-07 6.673716e-07 6.673716e-07 \n",
- "... ... ... ... ... \n",
- "21152 0.045351 2.677114e-03 2.677114e-03 2.677114e-03 \n",
- "21153 0.084373 2.240900e-03 2.240900e-03 2.240900e-03 \n",
- "21154 0.147691 8.005345e-03 8.005345e-03 8.005345e-03 \n",
- "21155 0.040216 1.141806e-03 1.141806e-03 1.141806e-03 \n",
- "21156 0.097779 3.098940e-03 3.098940e-03 3.098940e-03 \n",
- "\n",
- " loc_delta_mavg \n",
- "0 0.088640 \n",
- "1 0.228756 \n",
- "2 0.067256 \n",
- "3 0.130131 \n",
- "4 0.209683 \n",
- "... ... \n",
- "21152 0.160770 \n",
- "21153 0.138877 \n",
- "21154 0.176319 \n",
- "21155 0.137911 \n",
- "21156 0.108053 \n",
- "\n",
- "[21157 rows x 9 columns]"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "X_val"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The feature view and training dataset are now visible in the UI\n",
- "\n",
- ""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## ⏭️ **Next:** Part 03 \n",
- "\n",
- "In the following notebook, we will train a model on the dataset we created in this notebook and have quick overview of the lineage."
- ]
- }
- ],
- "metadata": {
- "interpreter": {
- "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
- },
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.9"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
\ No newline at end of file
diff --git a/docs/tutorials/fraud_batch/3_model_training.ipynb b/docs/tutorials/fraud_batch/3_model_training.ipynb
deleted file mode 100755
index 626e26501..000000000
--- a/docs/tutorials/fraud_batch/3_model_training.ipynb
+++ /dev/null
@@ -1,631 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "d5db0af3",
- "metadata": {},
- "source": [
- "# Part 03: Model training & UI Exploration\n",
- "[](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/fraud_batch/3_model_training.ipynb)\n",
- "\n",
- "In this last notebook, we will train a model on the dataset we created in the previous tutorial. We will train our model using standard Python and Scikit-learn, although it could just as well be trained with other machine learning frameworks such as PySpark, TensorFlow, and PyTorch. We will also show some of the exploration that can be done in Hopsworks, notably the search functions and the lineage.\n",
- "\n",
- "## 🗒️ This notebook is divided in 3 main sections:\n",
- "1. **Loading the training data**\n",
- "2. **Train the model**\n",
- "3. **Explore feature groups and views** via the UI.\n",
- "\n",
- ""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "79d97c37",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Connected. Call `.close()` to terminate connection gracefully.\n",
- "\n",
- "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/124\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Connected. Call `.close()` to terminate connection gracefully.\n"
- ]
- }
- ],
- "source": [
- "import hopsworks\n",
- "\n",
- "project = hopsworks.login()\n",
- "\n",
- "fs = project.get_feature_store()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f0d8bfb5",
- "metadata": {},
- "source": [
- "## ✨ Load Training Data \n",
- "\n",
- "First, we'll need to fetch the training dataset that we created in the previous notebook. We will use January - February data training and testing."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "c2702114",
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "from sklearn.linear_model import LogisticRegression\n",
- "\n",
- "# Load data.\n",
- "feature_view = fs.get_feature_view(\"transactions_view\", 1)\n",
- "X_train, y_train, X_val, y_val, X_test, y_test = feature_view.get_train_validation_test_split(1)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "36070a3c",
- "metadata": {},
- "source": [
- "We will train a model to predict `fraud_label` given the rest of the features."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "94acc187",
- "metadata": {},
- "source": [
- "Let's check the distribution of our target label."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "2dc84cc8",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "fraud_label\n",
- "0 0.998545\n",
- "1 0.001455\n",
- "dtype: float64"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "y_train.value_counts(normalize=True)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e546c204",
- "metadata": {},
- "source": [
- "Notice that the distribution is extremely skewed, which is natural considering that fraudulent transactions make up a tiny part of all transactions. Thus we should somehow address the class imbalance. There are many approaches for this, such as weighting the loss function, over- or undersampling, creating synthetic data, or modifying the decision threshold. In this example, we'll use the simplest method which is to just supply a class weight parameter to our learning algorithm. The class weight will affect how much importance is attached to each class, which in our case means that higher importance will be placed on positive (fraudulent) samples."
- ]
- },
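- {
- "cell_type": "markdown",
- "id": "undersampling-sketch-md",
- "metadata": {},
- "source": [
- "For reference, a minimal random-undersampling sketch (one of the alternatives mentioned above, which we will *not* use here) could look as follows; the 10:1 negative-to-positive ratio is an arbitrary illustrative choice:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "undersampling-sketch-code",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Minimal undersampling sketch (not used below).\n",
- "train = X_train.copy()\n",
- "train[\"fraud_label\"] = y_train.values.ravel()\n",
- "fraud = train[train[\"fraud_label\"] == 1]\n",
- "non_fraud = train[train[\"fraud_label\"] == 0].sample(n=10 * len(fraud), random_state=42)\n",
- "train_balanced = pd.concat([fraud, non_fraud]).sample(frac=1, random_state=42)"
- ]
- },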
- {
- "cell_type": "markdown",
- "id": "39fe274e",
- "metadata": {},
- "source": [
- "## 🏃 Train Model\n",
- "\n",
- "Next we'll train a model. Here, we set the class weight of the positive class to be twice as big as the negative class."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "70bcd17b",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "LogisticRegression(class_weight={0: 0.09999999999999998, 1: 0.9},\n",
- " solver='liblinear')"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Train model.\n",
- "pos_class_weight = 0.9\n",
- "clf = LogisticRegression(class_weight={0: 1.0 - pos_class_weight, 1: pos_class_weight}, solver='liblinear')\n",
- "clf.fit(X_train, y_train)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "c3bbd9c3",
- "metadata": {},
- "source": [
- "Let's see how well it performs on our validation data."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "6fbd02f5",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " precision recall f1-score support\n",
- "\n",
- " 0 1.00 1.00 1.00 21132\n",
- " 1 0.00 0.00 0.00 25\n",
- "\n",
- " accuracy 1.00 21157\n",
- " macro avg 0.50 0.50 0.50 21157\n",
- "weighted avg 1.00 1.00 1.00 21157\n",
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
- "UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
- "UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n"
- ]
- }
- ],
- "source": [
- "from sklearn.metrics import classification_report\n",
- "\n",
- "preds = clf.predict(X_val)\n",
- "\n",
- "print(classification_report(y_val, preds))\n"
- ]
- },
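- {
- "cell_type": "markdown",
- "id": "confusion-matrix-md",
- "metadata": {},
- "source": [
- "The warnings and the zero recall for class `1` show that the classifier predicted no fraudulent transactions at all. A confusion matrix (a small addition to the tutorial) makes this explicit:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "confusion-matrix-code",
- "metadata": {},
- "outputs": [],
- "source": [
- "from sklearn.metrics import confusion_matrix\n",
- "\n",
- "# Rows are true classes (0 = legitimate, 1 = fraud), columns are predicted classes.\n",
- "print(confusion_matrix(y_val, preds))"
- ]
- },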
- {
- "cell_type": "markdown",
- "id": "4e32237f",
- "metadata": {},
- "source": [
- "## Use the model to score transactions \n",
- "We trained model based on January - February data. Now lets retrieve March data and score whether transactions are fraudulend or not \n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "1b284826",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "VersionWarning: No training dataset version was provided to initialise batch scoring . Defaulting to version 1.\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "2022-06-20 09:44:08,897 INFO: USE `robin100_featurestore`\n",
- "2022-06-20 09:44:09,915 INFO: WITH right_fg0 AS (SELECT *\n",
- "FROM (SELECT `fg1`.`category` `category`, `fg1`.`amount` `amount`, `fg1`.`age_at_transaction` `age_at_transaction`, `fg1`.`days_until_card_expires` `days_until_card_expires`, `fg1`.`loc_delta` `loc_delta`, `fg1`.`cc_num` `join_pk_cc_num`, `fg1`.`datetime` `join_evt_datetime`, `fg0`.`trans_volume_mstd` `trans_volume_mstd`, `fg0`.`trans_volume_mavg` `trans_volume_mavg`, `fg0`.`trans_freq` `trans_freq`, `fg0`.`loc_delta_mavg` `loc_delta_mavg`, RANK() OVER (PARTITION BY `fg1`.`cc_num`, `fg1`.`datetime` ORDER BY `fg0`.`datetime` DESC) pit_rank_hopsworks\n",
- "FROM `robin100_featurestore`.`transactions_1` `fg1`\n",
- "INNER JOIN `robin100_featurestore`.`transactions_4h_aggs_1` `fg0` ON `fg1`.`cc_num` = `fg0`.`cc_num` AND `fg1`.`datetime` >= `fg0`.`datetime`\n",
- "WHERE `fg1`.`datetime` >= 1641164401000 AND `fg1`.`datetime` <= 1648763999000) NA\n",
- "WHERE `pit_rank_hopsworks` = 1) (SELECT `right_fg0`.`category` `category`, `right_fg0`.`amount` `amount`, `right_fg0`.`age_at_transaction` `age_at_transaction`, `right_fg0`.`days_until_card_expires` `days_until_card_expires`, `right_fg0`.`loc_delta` `loc_delta`, `right_fg0`.`trans_volume_mstd` `trans_volume_mstd`, `right_fg0`.`trans_volume_mavg` `trans_volume_mavg`, `right_fg0`.`trans_freq` `trans_freq`, `right_fg0`.`loc_delta_mavg` `loc_delta_mavg`\n",
- "FROM right_fg0)\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "UserWarning: pandas only support SQLAlchemy connectable(engine/connection) ordatabase string URI or sqlite3 DBAPI2 connectionother DBAPI2 objects are not tested, please consider using SQLAlchemy\n"
- ]
- }
- ],
- "source": [
- "from datetime import datetime\n",
- "date_format = \"%Y-%m-%d %H:%M:%S\"\n",
- "# Create training datasets based event time filter\n",
- "start_time = int(float(datetime.strptime(\"2022-01-03 00:00:01\", date_format).timestamp()) * 1000)\n",
- "end_time = int(float(datetime.strptime(\"2022-03-31 23:59:59\", date_format).timestamp()) * 1000)\n",
- "\n",
- "march_transactions = feature_view.get_batch_data(start_time = start_time, end_time = end_time)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "98ade6cf",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " category | \n",
- " amount | \n",
- " age_at_transaction | \n",
- " days_until_card_expires | \n",
- " loc_delta | \n",
- " trans_volume_mstd | \n",
- " trans_volume_mavg | \n",
- " trans_freq | \n",
- " loc_delta_mavg | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 4 | \n",
- " 0.003120 | \n",
- " 0.091597 | \n",
- " 0.139747 | \n",
- " 0.000000 | \n",
- " 0.003120 | \n",
- " 0.003120 | \n",
- " 0.003120 | \n",
- " 0.000000 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 2 | \n",
- " 0.002173 | \n",
- " 0.091615 | \n",
- " 0.139474 | \n",
- " 0.122200 | \n",
- " 0.002173 | \n",
- " 0.002173 | \n",
- " 0.002173 | \n",
- " 0.135041 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 4 | \n",
- " 0.000008 | \n",
- " 0.091622 | \n",
- " 0.139367 | \n",
- " 0.120125 | \n",
- " 0.000008 | \n",
- " 0.000008 | \n",
- " 0.000008 | \n",
- " 0.132748 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 4 | \n",
- " 0.000047 | \n",
- " 0.091628 | \n",
- " 0.139291 | \n",
- " 0.000000 | \n",
- " 0.000028 | \n",
- " 0.000028 | \n",
- " 0.000028 | \n",
- " 0.066374 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 4 | \n",
- " 0.000659 | \n",
- " 0.091725 | \n",
- " 0.137862 | \n",
- " 0.040270 | \n",
- " 0.000659 | \n",
- " 0.000659 | \n",
- " 0.000659 | \n",
- " 0.044502 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 102997 | \n",
- " 0 | \n",
- " 0.000736 | \n",
- " 0.357364 | \n",
- " 0.481294 | \n",
- " 0.228904 | \n",
- " 0.000736 | \n",
- " 0.000736 | \n",
- " 0.000736 | \n",
- " 0.252957 | \n",
- "
\n",
- " \n",
- " 102998 | \n",
- " 0 | \n",
- " 0.002816 | \n",
- " 0.357399 | \n",
- " 0.480778 | \n",
- " 0.166719 | \n",
- " 0.002816 | \n",
- " 0.002816 | \n",
- " 0.002816 | \n",
- " 0.184238 | \n",
- "
\n",
- " \n",
- " 102999 | \n",
- " 0 | \n",
- " 0.002934 | \n",
- " 0.357403 | \n",
- " 0.480721 | \n",
- " 0.166874 | \n",
- " 0.002875 | \n",
- " 0.002875 | \n",
- " 0.002875 | \n",
- " 0.184323 | \n",
- "
\n",
- " \n",
- " 103000 | \n",
- " 0 | \n",
- " 0.010322 | \n",
- " 0.357470 | \n",
- " 0.479735 | \n",
- " 0.001149 | \n",
- " 0.010322 | \n",
- " 0.010322 | \n",
- " 0.010322 | \n",
- " 0.001270 | \n",
- "
\n",
- " \n",
- " 103001 | \n",
- " 0 | \n",
- " 0.000592 | \n",
- " 0.357604 | \n",
- " 0.477760 | \n",
- " 0.166690 | \n",
- " 0.000592 | \n",
- " 0.000592 | \n",
- " 0.000592 | \n",
- " 0.184206 | \n",
- "
\n",
- " \n",
- "
\n",
- "
103002 rows × 9 columns
\n",
- "
"
- ],
- "text/plain": [
- " category amount age_at_transaction days_until_card_expires \\\n",
- "0 4 0.003120 0.091597 0.139747 \n",
- "1 2 0.002173 0.091615 0.139474 \n",
- "2 4 0.000008 0.091622 0.139367 \n",
- "3 4 0.000047 0.091628 0.139291 \n",
- "4 4 0.000659 0.091725 0.137862 \n",
- "... ... ... ... ... \n",
- "102997 0 0.000736 0.357364 0.481294 \n",
- "102998 0 0.002816 0.357399 0.480778 \n",
- "102999 0 0.002934 0.357403 0.480721 \n",
- "103000 0 0.010322 0.357470 0.479735 \n",
- "103001 0 0.000592 0.357604 0.477760 \n",
- "\n",
- " loc_delta trans_volume_mstd trans_volume_mavg trans_freq \\\n",
- "0 0.000000 0.003120 0.003120 0.003120 \n",
- "1 0.122200 0.002173 0.002173 0.002173 \n",
- "2 0.120125 0.000008 0.000008 0.000008 \n",
- "3 0.000000 0.000028 0.000028 0.000028 \n",
- "4 0.040270 0.000659 0.000659 0.000659 \n",
- "... ... ... ... ... \n",
- "102997 0.228904 0.000736 0.000736 0.000736 \n",
- "102998 0.166719 0.002816 0.002816 0.002816 \n",
- "102999 0.166874 0.002875 0.002875 0.002875 \n",
- "103000 0.001149 0.010322 0.010322 0.010322 \n",
- "103001 0.166690 0.000592 0.000592 0.000592 \n",
- "\n",
- " loc_delta_mavg \n",
- "0 0.000000 \n",
- "1 0.135041 \n",
- "2 0.132748 \n",
- "3 0.066374 \n",
- "4 0.044502 \n",
- "... ... \n",
- "102997 0.252957 \n",
- "102998 0.184238 \n",
- "102999 0.184323 \n",
- "103000 0.001270 \n",
- "103001 0.184206 \n",
- "\n",
- "[103002 rows x 9 columns]"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "march_transactions"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "cf708c28",
- "metadata": {},
- "outputs": [],
- "source": [
- "predictions = clf.predict(march_transactions)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "c9c42003",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([0, 0, 0, ..., 0, 0, 0])"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "predictions"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "d8c6b712",
- "metadata": {
- "jp-MarkdownHeadingCollapsed": true,
- "tags": []
- },
- "source": [
- "## 👓 Exploration\n",
- "In the Hopsworks feature store, the metadata allows for multiple levels of explorations and review. Here we will show a few of those capacities. \n",
- "\n",
- "### 🔎 Search \n",
- "Using the search function in the ui, you can query any aspect of the feature groups, feature_view and training data that was previously created.\n",
- "\n",
- "### 📊 Statistics \n",
- "We can also enable statistics in one or all the feature groups."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "2b1eed95",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Statistics Job started successfully, you can follow the progress at https://c.app.hopsworks.ai/p/124/jobs/named/transactions_1_compute_stats_20062022074522/executions\n"
- ]
- }
- ],
- "source": [
- "trans_fg = fs.get_feature_group(\"transactions\", version = 1)\n",
- "trans_fg.statistics_config = {\n",
- " \"enabled\": True,\n",
- " \"histograms\": True,\n",
- " \"correlations\": True\n",
- "}\n",
- "\n",
- "trans_fg.update_statistics_config()\n",
- "trans_fg.compute_statistics()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ee9d4ce0",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "\n",
- "### ⛓️ Lineage \n",
- "In all the feature groups and feature view you can look at the relation between each abstractions; what feature group created which training dataset and that is used in which model.\n",
- "This allows for a clear undestanding of the pipeline in relation to each element. "
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ee42a342",
- "metadata": {},
- "source": [
- "## 🎁 Wrapping things up \n",
- "\n",
- "We have now performed a simple training with training data that we have created in the feature store. This concludes the fisrt module and introduction to the core aspect of the feauture store. In the second module we will introduce streaming and external feature groups for a similar fraud use case."
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.9"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
\ No newline at end of file
diff --git a/docs/tutorials/fraud_online/1_feature_groups.ipynb b/docs/tutorials/fraud_online/1_feature_groups.ipynb
deleted file mode 100755
index 370815545..000000000
--- a/docs/tutorials/fraud_online/1_feature_groups.ipynb
+++ /dev/null
@@ -1,403 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "dba770f4",
- "metadata": {},
- "source": [
- ""
- ]
- },
- {
- "cell_type": "markdown",
- "id": "86f66405",
- "metadata": {},
- "source": [
- "# Part 01: Load, Engineer & Connect\n",
- "\n",
- "[](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/fraud_online/1_feature_groups.ipynb)\n",
- "\n",
- "This is the first part of the quick start series of tutorials about Hopsworks Feature Store. As part of this first module, we will work with data related to credit card transactions. \n",
- "The objective of this tutorial is to demonstrate how to work with the **Hopworks Feature Store** for batch data with a goal of training and deploying a model that can predict fraudulent transactions.\n",
- "\n",
- "## 🗒️ This notebook is divided in 3 sections:\n",
- "1. Loading the data and feature engineeing,\n",
- "2. Connect to the Hopsworks feature store,\n",
- "3. Create feature groups and upload them to the feature store.\n",
- "\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "91fef88b",
- "metadata": {},
- "outputs": [],
- "source": [
- "!pip install -U hopsworks==3.0.0rc5 --quiet"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "86cd6dca",
- "metadata": {},
- "source": [
- "First of all we will load the data and do some feature engineering on it."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "747920f7",
- "metadata": {},
- "source": [
- "## 💽 Loading the Data \n",
- "\n",
- "The data we will use comes from three different CSV files:\n",
- "\n",
- "- `credit_cards.csv`: credit card information such as the expiration date and provider.\n",
- "- `transactions.csv`: events containing information about when a credit card was used, such as a timestamp, location, and the amount spent. A boolean fraud_label variable (True/False) tells us whether a transaction was fraudulent or not.\n",
- "- `profiles.csv`: credit card user information such as birthdate and city of residence.\n",
- "\n",
- "In a production system, these CSV files would originate from separate data sources or tables, and probably separate data pipelines. **All three files have a common credit card number column cc_num, which you will use later to join features together from the different datasets.**\n",
- "\n",
- "Now, you can go ahead and load the data."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "bc5ce3a2",
- "metadata": {},
- "outputs": [],
- "source": [
- "import numpy as np\n",
- "import pandas as pd\n",
- "from datetime import datetime, timedelta\n",
- " \n",
- "credit_cards_df = pd.read_csv(\"https://repo.hops.works/master/hopsworks-tutorials/data/card_fraud_online/credit_cards.csv\")\n",
- "credit_cards_df.head(3)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "ba8a61a7",
- "metadata": {},
- "outputs": [],
- "source": [
- "profiles_df = pd.read_csv(\"https://repo.hops.works/master/hopsworks-tutorials/data/card_fraud_online/profiles.csv\", parse_dates=[\"birthdate\"])\n",
- "profiles_df.head(3)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "776d60b9",
- "metadata": {},
- "outputs": [],
- "source": [
- "trans_df = pd.read_csv(\"https://repo.hops.works/master/hopsworks-tutorials/data/card_fraud_online/transactions.csv\", parse_dates=[\"datetime\"])\n",
- "trans_df.head(3)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "d3077178",
- "metadata": {},
- "outputs": [],
- "source": [
- "trans_df = trans_df[trans_df.category == \"Cash Withdrawal\"].reset_index(level=0, drop=True)\n",
- "trans_df[\"country\"] = trans_df[\"country\"].fillna(\"US\")\n",
- "profiles_df = profiles_df[profiles_df.cc_num.isin(trans_df.cc_num.unique())].reset_index(level=0, drop=True)\n",
- "credit_cards_df = credit_cards_df[credit_cards_df.cc_num.isin(trans_df.cc_num.unique())].reset_index(level=0, drop=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a271cad8",
- "metadata": {},
- "outputs": [],
- "source": [
- "trans_df.sort_values([\"datetime\",\"cc_num\"], inplace=True)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "97f66a0b",
- "metadata": {},
- "source": [
- "## 🛠️ Feature Engineering \n",
- "\n",
- "Fraudulent transactions can differ from regular ones in many different ways. Typical red flags would for instance be a large transaction volume/frequency in the span of a few hours. It could also be the case that elderly people in particular are targeted by fraudsters. To facilitate model learning we will create additional features based on these patterns. In particular, we will create two types of features:\n",
- "1. **Features that aggregate data from different data sources**. This could for instance be the age of a customer at the time of a transaction, which combines the `birthdate` feature from `profiles.csv` with the `datetime` feature from `transactions.csv`.\n",
- "2. **Features that aggregate data from multiple time steps**. An example of this could be the transaction frequency of a credit card in the span of a few hours, which is computed using a window function.\n",
- "\n",
- "Let's start with the first category."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "3a7d3193",
- "metadata": {},
- "source": [
- "Now you are ready to start by computing the distance between consecutive transactions, which we will call `loc_delta`.\n",
- "Here we use the [Haversine distance](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.haversine_distances.html?highlight=haversine#sklearn.metrics.pairwise.haversine_distances) to quantify the distance between two longitude and latitude coordinates."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "165f5f47",
- "metadata": {},
- "outputs": [],
- "source": [
- "from math import radians\n",
- "\n",
- "# Feature engineering.\n",
- "trans_df.sort_values(\"datetime\", inplace=True)\n",
- "trans_df[[\"longitude\", \"latitude\"]] = trans_df[[\"longitude\", \"latitude\"]].applymap(radians)\n",
- "\n",
- "def haversine(long, lat, shift):\n",
- " \"\"\"Compute Haversine distance between each consecutive coordinate in (long, lat).\"\"\"\n",
- "\n",
- " long_shifted = long.shift(shift)\n",
- " lat_shifted = lat.shift(shift)\n",
- " long_diff = long_shifted - long\n",
- " lat_diff = lat_shifted - lat\n",
- "\n",
- " a = np.sin(lat_diff/2.0)**2\n",
- " b = np.cos(lat) * np.cos(lat_shifted) * np.sin(long_diff/2.0)**2\n",
- " c = 2*np.arcsin(np.sqrt(a + b))\n",
- "\n",
- " return c\n",
- "\n",
- "def time_delta(datetime_value, shift):\n",
- " \"\"\"Compute time difference between each consecutive transaction.\"\"\"\n",
- "\n",
- " time_shifted = datetime_value.shift(shift)\n",
- " return time_shifted\n",
- "\n",
- "trans_df[\"loc_delta_t_plus_1\"] = trans_df.groupby(\"cc_num\")\\\n",
- " .apply(lambda x : haversine(x[\"longitude\"], x[\"latitude\"], 1))\\\n",
- " .reset_index(level=0, drop=True)\\\n",
- " .fillna(0)\n",
- "\n",
- "trans_df[\"loc_delta_t_minus_1\"] = trans_df.groupby(\"cc_num\")\\\n",
- " .apply(lambda x : haversine(x[\"longitude\"], x[\"latitude\"], -1))\\\n",
- " .reset_index(level=0, drop=True)\\\n",
- " .fillna(0)\n",
- "\n",
- "trans_df[\"time_delta_t_plus_1\"] = trans_df.groupby(\"cc_num\")\\\n",
- " .apply(lambda x : time_delta(x[\"datetime\"], 1 ))\\\n",
- " .reset_index(level=0, drop=True)\n",
- "\n",
- "trans_df[\"time_delta_t_minus_1\"] = trans_df.groupby(\"cc_num\")\\\n",
- " .apply(lambda x : time_delta(x[\"datetime\"], -1))\\\n",
- " .reset_index(level=0, drop=True)\n",
- "\n",
- "trans_df[\"time_delta_t_plus_1\"] = (trans_df.datetime - trans_df.time_delta_t_plus_1 )/ np.timedelta64(1, 'D')\n",
- "trans_df[\"time_delta_t_minus_1\"] = (trans_df.time_delta_t_minus_1 - trans_df.datetime )/ np.timedelta64(1, 'D')\n",
- "trans_df[\"time_delta_t_plus_1\"] = trans_df.time_delta_t_plus_1.fillna(0)\n",
- "trans_df[\"time_delta_t_minus_1\"] = trans_df.time_delta_t_minus_1.fillna(0) "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1755e299",
- "metadata": {},
- "outputs": [],
- "source": [
- "trans_df = trans_df[[\"tid\",\"datetime\",\"cc_num\",\"category\",\"amount\",\"city\",\"country\",\"fraud_label\",\"loc_delta_t_plus_1\", \"loc_delta_t_minus_1\", \"time_delta_t_plus_1\", \"time_delta_t_minus_1\"]]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "81b4b36c",
- "metadata": {},
- "outputs": [],
- "source": [
- "trans_df.datetime = trans_df.datetime.values.astype(np.int64) // 10 ** 6\n",
- "trans_df"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "d93ace20",
- "metadata": {},
- "source": [
- "## 🪄 Creating Feature Groups \n",
- "\n",
- "A [feature group](https://docs.hopsworks.ai/feature-store-api/latest/generated/feature_group/) can be seen as a collection of conceptually related features. In our case, we will create a feature group for the transaction data and a feature group for the windowed aggregations on the transaction data. Both will have `tid` as primary key, which will allow us to join them when creating a dataset in the next tutorial.\n",
- "\n",
- "Feature groups can also be used to define a namespace for features. For instance, in a real-life setting we would likely want to experiment with different window lengths. In that case, we can create feature groups with identical schema for each window length. \n",
- "\n",
- "Before you can create a feature group you need to connect to our feature store."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "7997d511",
- "metadata": {},
- "outputs": [],
- "source": [
- "import hopsworks\n",
- "\n",
- "project = hopsworks.login()\n",
- "\n",
- "fs = project.get_feature_store()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "4aa97688",
- "metadata": {},
- "source": [
- "To create a feature group we need to give it a name and specify a primary key. It is also good to provide a description of the contents of the feature group and a version number, if it is not defined it will automatically be incremented to `1`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "8be295ac",
- "metadata": {},
- "outputs": [],
- "source": [
- "trans_fg = fs.get_or_create_feature_group(\n",
- " name=\"transactions_online_fg\",\n",
- " version=1,\n",
- " description=\"Transaction data\",\n",
- " primary_key=['cc_num'],\n",
- " event_time=['datetime'],\n",
- " online_enabled=True\n",
- ")\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9d1c7350",
- "metadata": {},
- "source": [
- "Here you have also set `online_enabled=True`, which enables low latency access to the data. A full list of arguments can be found in the [documentation](https://docs.hopsworks.ai/feature-store-api/latest/generated/api/feature_store_api/#create_feature_group).\n",
- "\n",
- "At this point, you have only specified some metadata for the feature group. It does not store any data or even have a schema defined for the data. To make the feature group persistent you need to populate it with its associated data using the `insert` function."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "0ad7d71d",
- "metadata": {},
- "outputs": [],
- "source": [
- "trans_fg.insert(trans_df)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "a06ee84d",
- "metadata": {},
- "source": [
- "We can move on and do the same thing for the feature group with our windows aggregation."
- ]
- },
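- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1b2c3d4e",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Sketch only: `window_aggs_df` is a hypothetical DataFrame of windowed\n",
- "# aggregations (not computed in this notebook); a second feature group\n",
- "# would be created and populated exactly like `transactions_online_fg`.\n",
- "# window_aggs_fg = fs.get_or_create_feature_group(\n",
- "#     name=\"transactions_online_window_aggs_fg\",\n",
- "#     version=1,\n",
- "#     description=\"Windowed aggregations of transaction data\",\n",
- "#     primary_key=['cc_num'],\n",
- "#     event_time=['datetime'],\n",
- "#     online_enabled=True\n",
- "# )\n",
- "# window_aggs_fg.insert(window_aggs_df)"
- ]
- },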
- {
- "cell_type": "markdown",
- "id": "dcc4e7af",
- "metadata": {},
- "source": [
- "Click on the hyperlink printed in the cell output above to inspect your feature group in the UI.\n",
- "\n",
- ""
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8e8fa1c1",
- "metadata": {},
- "source": [
- "## 👓 Exploration\n",
- "In the Hopsworks feature store, the metadata allows for multiple levels of explorations and review. Here we will show a few of those capacities. \n",
- "\n",
- "### 🔎 Search \n",
- "Using the search function in the ui, you can query any aspect of the feature groups, feature_view and training data that was previously created.\n",
- "\n",
- "### 📊 Statistics \n",
- "You can also enable statistics in the feature groups."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a09bcf81",
- "metadata": {},
- "outputs": [],
- "source": [
- "trans_fg = fs.get_feature_group(\"transactions_online_fg\", version = 1)\n",
- "trans_fg.statistics_config = {\n",
- " \"enabled\": True,\n",
- " \"histograms\": True,\n",
- " \"correlations\": True\n",
- "}\n",
- "\n",
- "trans_fg.update_statistics_config()\n",
- "trans_fg.compute_statistics()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "a35c1ef3",
- "metadata": {},
- "source": [
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b9d96f3c",
- "metadata": {},
- "source": [
- "## ⏭️ **Next:** Part 02 \n",
- "\n",
- "In the following notebook you will use our feature groups to create a dataset you can train a model on."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "d12c32f5",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "interpreter": {
- "hash": "e1ddeae6eefc765c17da80d38ea59b893ab18c0c0904077a035ef84cfe367f83"
- },
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.9"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
\ No newline at end of file
diff --git a/docs/tutorials/fraud_online/2_feature_view_creation.ipynb b/docs/tutorials/fraud_online/2_feature_view_creation.ipynb
deleted file mode 100755
index 8ec86a2f1..000000000
--- a/docs/tutorials/fraud_online/2_feature_view_creation.ipynb
+++ /dev/null
@@ -1,302 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- ""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Part 02: Training Data & Feature views\n",
- "\n",
- "[](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/fraud_online/2_feature_view_creation.ipynb)\n",
- "\n",
- "This is the second part of the quick start series of tutorials about Hopsworks Feature Store. This notebook explains how to read from a feature group and create training dataset within the feature store\n",
- "\n",
- "## 🗒️ In this notebook we will see how to create a training dataset from the feature groups: \n",
- "1. **Select the features** we want to train our model on,\n",
- "2. **How the features should be preprocessed,**\n",
- "3. **Create a dataset** for training fraud detection model.\n",
- "\n",
- " "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import hopsworks\n",
- "\n",
- "project = hopsworks.login()\n",
- "\n",
- "fs = project.get_feature_store()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 🔪 Feature Selection \n",
- "\n",
- "We start by selecting all the features we want to include for model training/inference."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Load feature groups.\n",
- "trans_fg = fs.get_feature_group('transactions_online_fg', version=1)\n",
- "\n",
- "# Select features for training data.\n",
- "ds_query = trans_fg.select([\"fraud_label\", \"loc_delta_t_plus_1\", \"loc_delta_t_minus_1\", \"time_delta_t_plus_1\", \"time_delta_t_minus_1\", \"country\"])\n",
- "\n",
- "# uncomment this if you would like to view query results\n",
- "# ds_query.show(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Recall that you computed the features in `transactions_fg`. If you had created multiple feature groups with identical schema for different window lengths, and wanted to include them in the join you would need to include a prefix argument in the join to avoid feature name clash. See the [documentation](https://docs.hopsworks.ai/feature-store-api/latest/generated/api/query_api/#join) for more details."
- ]
- },
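- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Sketch only: `window_4h_fg` is a hypothetical second feature group with an\n",
- "# identical schema; the prefix keeps its feature names from clashing.\n",
- "# window_4h_fg = fs.get_feature_group('transactions_online_4h_fg', version=1)\n",
- "# ds_query = ds_query.join(window_4h_fg.select_all(), prefix='fourh_')"
- ]
- },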
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "🤖 Transformation Functions \n",
- "\n",
- "We will preprocess our data using *min-max scaling* on numerical features and *label encoding* on categorical features. To do this we simply define a mapping between our features and transformation functions. This ensures that transformation functions such as *min-max scaling* are fitted only on the training data (and not the validation/test data), which ensures that there is no data leakage."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Load the transformation functions.\n",
- "min_max_scaler = fs.get_transformation_function(name=\"min_max_scaler\")\n",
- "label_encoder = fs.get_transformation_function(name=\"label_encoder\")\n",
- "\n",
- "# Map features to transformation functions.\n",
- "transformation_functions = {\n",
- " \"loc_delta_t_plus_1\": min_max_scaler, \n",
- " \"loc_delta_t_minus_1\": min_max_scaler, \n",
- " \"time_delta_t_plus_1\": min_max_scaler, \n",
- " \"time_delta_t_minus_1\": min_max_scaler,\n",
- " \"country\": label_encoder\n",
- "}"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## ⚙️ Feature View Creation \n",
- "\n",
- "The Feature Views allows schema in form of a query with filters, define a model target feature/label and additional transformation functions.\n",
- "In order to create a Feature View we may use `fs.create_feature_view()`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "feature_view = fs.create_feature_view(\n",
- " name='fraud_online_model_view',\n",
- " query=ds_query,\n",
- " labels=[\"fraud_label\"],\n",
- " transformation_functions=transformation_functions\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "To view and explore data in the feature view we can retrieve batch data using `get_batch_data()` method "
- ]
- },
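- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Retrieve the feature view's data as a dataframe for exploration.\n",
- "batch_data = feature_view.get_batch_data()\n",
- "batch_data.head()"
- ]
- },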
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 🏋️ Training Dataset Creation\n",
- "\n",
- "In Hopsworks training data is a query where the projection (set of features) is determined by the parent FeatureView with an optional snapshot on disk of the data returned by the query.\n",
- "\n",
- "**Training Dataset may contain splits such as:** \n",
- "* Training set - the subset of training data used to train a model.\n",
- "* Validation set - the subset of training data used to evaluate hparams when training a model\n",
- "* Test set - the holdout subset of training data used to evaluate a mode\n",
- "\n",
- "Training dataset is created using `fs.create_training_dataset()` method.\n",
- "\n",
- "**From feature view APIs we can also create training datasts based on even time filters specifing `start_time` and `end_time`** \n",
- "\n"
- ]
- },
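- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Sketch only: a random train/validation/test split as an alternative to the\n",
- "# event-time filtered datasets created below (adjust arguments to your needs).\n",
- "# td_version, td_job = feature_view.create_train_validation_test_split(\n",
- "#     validation_size=0.2,\n",
- "#     test_size=0.1,\n",
- "#     data_format=\"csv\"\n",
- "# )"
- ]
- },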
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from datetime import datetime\n",
- "date_format = \"%Y-%m-%d %H:%M:%S\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Create training datasets based event time filter\n",
- "start_time = int(float(datetime.strptime(\"2022-01-01 00:00:01\", date_format).timestamp()) * 1000)\n",
- "end_time = int(float(datetime.strptime(\"2022-02-28 23:59:59\", date_format).timestamp()) * 1000)\n",
- "\n",
- "td_jan_feb_version, td_job = feature_view.create_training_data(\n",
- " start_time = start_time,\n",
- " end_time = end_time, \n",
- " description = 'transactions_dataset_jan_feb',\n",
- " data_format = \"csv\",\n",
- " coalesce = True,\n",
- " write_options = {'wait_for_job': True},\n",
- " )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "start_time = int(float(datetime.strptime(\"2022-03-01 00:00:01\", date_format).timestamp()) * 1000)\n",
- "end_time = int(float(datetime.strptime(\"2022-03-31 23:59:59\", date_format).timestamp()) * 1000)\n",
- "\n",
- "td_mar_version, td_job = feature_view.create_training_data(\n",
- " start_time = start_time,\n",
- " end_time = end_time,\n",
- " description = 'transactions_dataset_mar',\n",
- " data_format = \"csv\",\n",
- " coalesce = True,\n",
- " write_options = {'wait_for_job': True},\n",
- " )\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 🪝 Training Dataset retreival \n",
- "\n",
- "To retrieve training data from storage (already materialised) or from feature groups direcly we can use `get_training_dataset_splits` or `get_training_dataset` methods. If version is not provided or provided version has not already existed, it creates a new version of training data according to given arguments and returns a dataframe. If version is provided and has already existed, it reads training data from storage or feature groups and returns a dataframe. If split is provided, it reads the specific split."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "train_jan_feb_x, train_jan_feb_y = feature_view.get_training_data(td_jan_feb_version)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "test_mar_x, test_mar_y = feature_view.get_training_data(td_mar_version)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "train_jan_feb_x"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "test_mar_x"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The feature view and training dataset are now visible in the UI\n",
- "\n",
- "\n",
- "\n",
- "### ⛓️ Lineage \n",
- "In all the feature groups and feature view you can look at the relation between each abstractions; what feature group created which training dataset and that is used in which model.\n",
- "This allows for a clear undestanding of the pipeline in relation to each element. \n",
- "\n",
- ""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## ⏭️ **Next:** Part 03 \n",
- "\n",
- "In the following notebook, you will train a model on the dataset you created in this notebook."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "interpreter": {
- "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
- },
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.9"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/docs/tutorials/fraud_online/3_model_training.ipynb b/docs/tutorials/fraud_online/3_model_training.ipynb
deleted file mode 100755
index 9409ba04f..000000000
--- a/docs/tutorials/fraud_online/3_model_training.ipynb
+++ /dev/null
@@ -1,555 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "65c1b926",
- "metadata": {},
- "source": [
- ""
- ]
- },
- {
- "cell_type": "markdown",
- "id": "1a97d239",
- "metadata": {},
- "source": [
- "# Part 03: Model training & UI Exploration\n",
- "\n",
- "[](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/fraud_online/3_model_training.ipynb)\n",
- "\n",
- "In this last notebook, we will train a model on the dataset we created in the previous tutorial. We will train our model using standard Python and Scikit-learn, although it could just as well be trained with other machine learning frameworks such as PySpark, TensorFlow, and PyTorch. We will also show some of the exploration that can be done in Hopsworks, notably the search functions and the lineage.\n",
- "\n",
- "## 🗒️ This notebook is divided in 5 main sections:\n",
- "1. **Loading the training data**\n",
- "2. **Train the model**\n",
- "3. **Register model to Hopsworks model registry**.\n",
- "4. **Deploy the model on KServe behind Hopsworks for real-time inference requests**.\n",
- "5. **Test model deployment and use model serving rest APIs**.\n",
- "\n",
- ""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "62f98488",
- "metadata": {},
- "outputs": [],
- "source": [
- "import hopsworks\n",
- "\n",
- "project = hopsworks.login()\n",
- "\n",
- "fs = project.get_feature_store()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "f41f8378",
- "metadata": {},
- "outputs": [],
- "source": [
- "feature_view = fs.get_feature_view(\"fraud_online_model_view\", 1)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "a5113b97",
- "metadata": {},
- "source": [
- "## ✨ Load Training Data \n",
- "\n",
- "First, we'll need to fetch the training dataset that we created in the previous notebook. We will use January - February data training and testing."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "5f33dd35",
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "from sklearn.linear_model import LogisticRegression\n",
- "\n",
- "train_jan_feb_x, train_jan_feb_y = feature_view.get_training_data(1)\n",
- "test_mar_x, test_mar_y = feature_view.get_training_data(2)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "bb68bd57",
- "metadata": {},
- "source": [
- "## 🏃 Train Model\n",
- "\n",
- "Next we'll train a model. Here, we set the class weight of the positive class to be twice as big as the negative class."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "01e6127d",
- "metadata": {},
- "outputs": [],
- "source": [
- "import numpy as np\n",
- "from sklearn.ensemble import IsolationForest\n",
- "\n",
- "# fit the model\n",
- "clf = IsolationForest(max_samples=100)\n",
- "clf.fit(train_jan_feb_x)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "793f0cd1",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Train Predictions\n",
- "y_pred_train = clf.predict(train_jan_feb_x)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "cf5d9191",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Test Predictions\n",
- "y_pred_test = clf.predict(test_mar_x)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "39efab3e",
- "metadata": {},
- "outputs": [],
- "source": [
- "from sklearn.metrics import confusion_matrix, f1_score\n",
- "from matplotlib import pyplot\n",
- "import seaborn as sn\n",
- "\n",
- "%matplotlib inline\n",
- "\n",
- "if_cm=confusion_matrix(test_mar_y, y_pred_test)\n",
- "pd.DataFrame(if_cm)\n",
- "df_cm = pd.DataFrame(if_cm, ['step', 'True Normal', 'True Fraud'],['Pred Normal', 'step', 'Pred Fraud'])\n",
- "df_cm.drop(index=\"step\",inplace=True)\n",
- "df_cm.drop(\"step\", axis=1, inplace=True)\n",
- "\n",
- "pyplot.figure(figsize = (8,4))\n",
- "sn.set(font_scale=1.4)#for label size\n",
- "sn.heatmap(df_cm, annot=True,annot_kws={\"size\": 16},fmt='g')# font size"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "405861cc",
- "metadata": {},
- "outputs": [],
- "source": [
- "if_cm=confusion_matrix(train_jan_feb_y, y_pred_train)\n",
- "pd.DataFrame(if_cm)\n",
- "df_cm = pd.DataFrame(if_cm, ['step', 'True Normal', 'True Fraud'],['Pred Normal', 'step', 'Pred Fraud'])\n",
- "df_cm.drop(index=\"step\",inplace=True)\n",
- "df_cm.drop(\"step\", axis=1, inplace=True)\n",
- "\n",
- "pyplot.figure(figsize = (8,4))\n",
- "sn.set(font_scale=1.4)#for label size\n",
- "sn.heatmap(df_cm, annot=True,annot_kws={\"size\": 16},fmt='g')# font size"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "d154b6e3",
- "metadata": {},
- "outputs": [],
- "source": [
- "from sklearn.metrics import f1_score\n",
- "# Compute f1 score\n",
- "metrics = {\"fscore\": f1_score(test_mar_y, y_pred_test, average='micro')}\n",
- "metrics"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "0808daed",
- "metadata": {},
- "source": [
- "\n",
- "## Register model\n",
- "\n",
- "One of the features in Hopsworks is the model registry. This is where we can store different versions of models and compare their performance. Models from the registry can then be served as API endpoints.\n",
- "\n",
- "Let's connect to the model registry using the HSML library from Hopsworks."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "f56939c5",
- "metadata": {},
- "outputs": [],
- "source": [
- "mr = project.get_model_registry()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "cb395a54",
- "metadata": {},
- "outputs": [],
- "source": [
- "import joblib\n",
- "\n",
- "joblib.dump(clf, 'model.pkl')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "094956ac",
- "metadata": {},
- "source": [
- "The model needs to be set up with a Model Schema, which describes the inputs and outputs for a model.\n",
- "\n",
- "A Model Schema can be automatically generated from training examples, as shown below."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "41394294",
- "metadata": {},
- "outputs": [],
- "source": [
- "from hsml.schema import Schema\n",
- "from hsml.model_schema import ModelSchema\n",
- "\n",
- "input_schema = Schema(train_jan_feb_x)\n",
- "output_schema = Schema(train_jan_feb_y)\n",
- "model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)\n",
- "\n",
- "model_schema.to_dict()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c8f23250",
- "metadata": {},
- "outputs": [],
- "source": [
- "test_credit_card = [4467360740682089]\n",
- "model = mr.sklearn.create_model(\n",
- " name=\"fraud_online_tutorial_model\",\n",
- " metrics=metrics,\n",
- " description=\"Isolation forest anomaly detection model\",\n",
- " input_example = test_credit_card,\n",
- " model_schema=model_schema\n",
- ")\n",
- "\n",
- "model.save('model.pkl')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "80709ea1",
- "metadata": {},
- "source": [
- "## Deploy model\n",
- "### About Model Serving\n",
- "Models can be served via KFServing or \"default\" serving, which means a Docker container exposing a Flask server. For KFServing models, or models written in Tensorflow, you do not need to write a prediction file (see the section below). However, for sklearn models using default serving, you do need to proceed to write a prediction file.\n",
- "\n",
- "In order to use KFServing, you must have Kubernetes installed and enabled on your cluster.\n",
- "\n",
- "### Create the Prediction File\n",
- "In order to deploy a model, you need to write a Python file containing the logic to return a prediction from the model. Don't worry, this is usually a matter of just modifying some paths in a template script. An example can be seen in the code block below, where we have taken this Scikit-learn template script and changed two paths (see comments)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "ea08b207",
- "metadata": {},
- "outputs": [],
- "source": [
- "%%writefile predict_example.py\n",
- "import os\n",
- "import hsfs\n",
- "from sklearn.externals import joblib\n",
- "\n",
- "class Predict(object):\n",
- "\n",
- " def __init__(self):\n",
- " \"\"\" Initializes the serving state, reads a trained model\"\"\" \n",
- " # get feature store handle\n",
- " fs_conn = hsfs.connection()\n",
- " self.fs = fs_conn.get_feature_store()\n",
- " \n",
- " # get feature views\n",
- " self.fv = self.fs.get_feature_view(\"fraud_online_model_view\", 1)\n",
- " \n",
- " # initialise serving\n",
- " self.fv.init_serving(1)\n",
- "\n",
- " # load the trained model\n",
- " self.model = joblib.load(os.environ[\"ARTIFACT_FILES_PATH\"] + \"/model.pkl\")\n",
- " print(\"Initialization Complete\")\n",
- "\n",
- " def predict(self, inputs):\n",
- " \"\"\" Serves a prediction request usign a trained model\"\"\"\n",
- " return self.model.predict(self.fv.get_feature_vector({\"cc_num\": inputs[0]})).tolist() # Numpy Arrays are not JSON serializable\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "0eef6b74",
- "metadata": {},
- "source": [
- "If you wonder why we use the path Models/fraud_tutorial_model/1/model.pkl, it is useful to know that the Data Sets tab in the Hopsworks UI lets you browse among the different files in the project. Registered models will be found underneath the Models directory. Since we saved our model with the name fraud_tutorial_model, that's the directory we should look in. 1 is just the version of the model we want to deploy.\n",
- "\n",
- "This script needs to be put into a known location in the Hopsworks file system. Let's call the file predict_example.py and put it in the Models directory."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "bc9e59ce",
- "metadata": {},
- "outputs": [],
- "source": [
- "import os\n",
- "dataset_api = project.get_dataset_api()\n",
- "\n",
- "uploaded_file_path = dataset_api.upload(\"predict_example.py\", \"Models\", overwrite=True)\n",
- "predictor_script_path = os.path.join(\"/Projects\", project.name, uploaded_file_path)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "86399139",
- "metadata": {},
- "source": [
- "## Create the deployment\n",
- "Here, we fetch the model we want from the model registry and define a configuration for the deployment. For the configuration, we need to specify the serving type (default or KFserving) and in this case, since we use default serving and an sklearn model, we need to give the location of the prediction script."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "3816d5c3",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Use the model name from the previous notebook.\n",
- "model = mr.get_model(\"fraud_online_tutorial_model\", version=1)\n",
- "\n",
- "# Give it any name you want\n",
- "deployment = model.deploy(\n",
- " name=\"fraudonlinemodeldeployment\", \n",
- " model_server=\"PYTHON\",\n",
- " serving_tool=\"KSERVE\",\n",
- " script_file=predictor_script_path\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "8cbe7b79",
- "metadata": {},
- "outputs": [],
- "source": [
- "print(\"Deployment: \" + deployment.name)\n",
- "deployment.describe()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "5655bbed",
- "metadata": {},
- "source": [
- "#### The deployment has now been registered. However, to start it you need to run:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1f4f719e",
- "metadata": {},
- "outputs": [],
- "source": [
- "deployment.start()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "97e396dc",
- "metadata": {},
- "outputs": [],
- "source": [
- "deployment.get_logs()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8ddb53fa",
- "metadata": {},
- "source": [
- "## Using the deployment\n",
- "Let's use the input example that we registered together with the model to query the deployment."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "5802d351",
- "metadata": {},
- "outputs": [],
- "source": [
- "data = {\n",
- " \"inputs\": model.input_example\n",
- "}\n",
- "\n",
- "deployment.predict(data)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "30fb1d1d",
- "metadata": {},
- "outputs": [],
- "source": [
- "deployment.get_logs()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "1375739f",
- "metadata": {},
- "source": [
- "### Use REST endpoint\n",
- "\n",
- "You can also use a REST endpoint for your model. To do this you need to create an API key with 'serving' enabled, and retrieve the endpoint URL from the Model Serving UI.\n",
- "\n",
- "Go to the Model Serving UI and click on the eye icon next to a model to retrieve the endpoint URL. The shorter URL is an internal endpoint that you can only reach from within Hopsworks. If you want to call it from outside, you need one of the longer URLs. \n",
- "\n",
- "\n",
- ""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c3f53c9c",
- "metadata": {},
- "outputs": [],
- "source": [
- "import os\n",
- "import requests\n",
- "\n",
- "mr = project.get_model_registry()\n",
- "\n",
- "# Use the model name from the previous notebook.\n",
- "model = mr.get_model(\"fraud_tutorial_model\", version=1)\n",
- "\n",
- "test_inputs = [model.input_example]\n",
- "\n",
- "API_KEY = \"...\" # Put your API key here.\n",
- "MODEL_SERVING_URL = \"...\" # Put model serving endppoint here.\n",
- "HOST_NAME = \"...\" # Put your hopsworks model serving hostname here \n",
- "\n",
- "data = {\"inputs\": test_inputs}\n",
- "headers = {\n",
- " \"Content-Type\": \"application/json\", \"Accept\": \"application/json\",\n",
- " \"Authorization\": f\"ApiKey {API_KEY}\",\n",
- " \"Host\": HOST_NAME}\n",
- "\n",
- "response = requests.post(MODEL_SERVING_URL, verify=False, headers=headers, json=data)\n",
- "response.json()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "2190fab6",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Now lets test feature vectors from online store\n",
- "data = {\"inputs\": 4467360740682089}\n",
- "response = requests.post(url, verify=False, headers=headers, json=data)\n",
- "response.json()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "4b1c78d7",
- "metadata": {},
- "source": [
- "## Stop Deployment\n",
- "To stop the deployment we simply run:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "ccde7f76",
- "metadata": {},
- "outputs": [],
- "source": [
- "deployment.stop()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "395f3c2f",
- "metadata": {},
- "source": [
- "## 🎁 Wrapping things up \n",
- "\n",
- "In this module we introduced stream feature group, performed with training data that we have created from feature view and depoyed model in production."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "0639f4cb",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.9"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/mkdocs.yml b/mkdocs.yml
index add9dc24a..a3adfd1c4 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -10,16 +10,16 @@ edit_uri: ""
nav:
- Home: index.md
- - Getting Started: getting_started/quickstart.ipynb
+ - Getting Started: hopsworks-tutorials/quickstart.ipynb
- Tutorials:
- Fraud Batch:
- - 1. Feature Groups: tutorials/fraud_batch/1_feature_groups.ipynb
- - 2. Feature View: tutorials/fraud_batch/2_feature_view_creation.ipynb
- - 3. Model Training: tutorials/fraud_batch/3_model_training.ipynb
+ - 1. Feature Groups: hopsworks-tutorials/fraud_batch/1_feature_groups.ipynb
+ - 2. Feature View: hopsworks-tutorials/fraud_batch/2_feature_view_creation.ipynb
+ - 3. Model Training: hopsworks-tutorials/fraud_batch/3_model_training.ipynb
- Fraud Online:
- - 1. Feature Groups: tutorials/fraud_online/1_feature_groups.ipynb
- - 2. Feature View: tutorials/fraud_online/2_feature_view_creation.ipynb
- - 3. Model Training: tutorials/fraud_online/3_model_training.ipynb
+ - 1. Feature Groups: hopsworks-tutorials/fraud_online/1_feature_groups.ipynb
+ - 2. Feature View: hopsworks-tutorials/fraud_online/2_feature_view_creation.ipynb
+ - 3. Model Training: hopsworks-tutorials/fraud_online/3_model_training.ipynb
- Concepts:
- Hopsworks Platform: concepts/hopsworks.md
- Feature Store:
diff --git a/prepare_images.py b/prepare_images.py
new file mode 100644
index 000000000..1a98d073f
--- /dev/null
+++ b/prepare_images.py
@@ -0,0 +1,20 @@
+import argparse
+import shutil
+import os
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--path', '-p', help="path to the hopsworks tutorials dir", type=str, default="docs/hopsworks-tutorials")
+ parser.add_argument('--src', '-s', help="name of directory with images to copy", type=str, default="images")
+
+ args = parser.parse_args()
+
+ except_dirs = [".git", args.src]
+
+ # Every tutorial subdirectory, excluding git metadata and the shared images directory itself.
+ sub_dirs = [
+ element for element in os.listdir(args.path)
+ if os.path.isdir(os.path.join(args.path, element)) and element not in except_dirs
+ ]
+
+ # Copy the shared images into each tutorial directory so relative image links in the notebooks resolve.
+ for dst in sub_dirs:
+ shutil.copytree(os.path.join(args.path, args.src), os.path.join(args.path, dst, args.src), dirs_exist_ok=True)