diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d7fa37f..c06c9c3 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -9,10 +9,10 @@ on: jobs: native-py: - runs-on: ubuntu-20.04 + runs-on: ubuntu-24.04 strategy: matrix: - python-version: [ '3.8', '3.9', '3.10', '3.11' ] + python-version: [ '3.9', '3.10', '3.11', '3.12' ] max-parallel: 4 steps: diff --git a/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb b/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb index 46bbca8..54a6c2c 100644 --- a/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb +++ b/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb @@ -13,8 +13,7 @@ "from medcat.cat import CAT\n", "from medcat.meta_cat import MetaCAT\n", "from medcat.config_meta_cat import ConfigMetaCAT\n", - "from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBPE, TokenizerWrapperBERT\n", - "from tokenizers import ByteLevelBPETokenizer" + "from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBERT" ] }, { @@ -31,82 +30,234 @@ }, { "cell_type": "markdown", - "id": "5d0606ec", + "id": "f310cef3", "metadata": {}, "source": [ - "# Set parameters" + "### Load the model pack with MetaCATs\n" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "dd7a2e97", "metadata": {}, "outputs": [], "source": [ - "# relative path to working_with_cogstack folder\n", - "_rel_path = os.path.join(\"..\", \"..\", \"..\")\n", - "# absolute path to working_with_cogstack folder\n", - "base_path = os.path.abspath(_rel_path)\n", - "# Load mct export\n", - "ann_dir = os.path.join(base_path, \"data\", \"medcattrainer_export\")\n", - "\n", - "mctrainer_export_path = ann_dir + \"\" # name of your mct export\n", - "\n", + "model_pack = '' # .zip model pack location \n", + "mctrainer_export = \"\" # name of your mct export" + ] + }, + { + "cell_type": "code", + 
"execution_count": 4, + "id": "921d5e9e", + "metadata": {}, + "outputs": [], + "source": [ "# Load model\n", - "model_dir = os.path.join(base_path, \"models\", \"modelpack\")\n", - "modelpack = '' # name of modelpack\n", - "model_pack_path = os.path.join(model_dir, modelpack)\n", - " #output_modelpack = model_dir + f\"{today}_trained_model\"\n", + "cat = CAT.load_model_pack(model_pack)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "b205d51b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are: 3 meta cat models in this model pack.\n" + ] + } + ], + "source": [ "\n", - "# will be used to date the trained model\n", - "today = str(date.today())\n", - "today = today.replace(\"-\",\"\")\n", + "# Check what meta cat models are in this model pack.\n", + "print(f'There are: {len(cat._meta_cats)} meta cat models in this model pack.')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "31d7632a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"Category Name\": \"Temporality\",\n", + " \"Description\": \"No description\",\n", + " \"Classes\": {\n", + " \"Past\": 0,\n", + " \"Recent\": 1,\n", + " \"Future\": 2\n", + " },\n", + " \"Model\": \"bert\"\n", + "}\n" + ] + } + ], + "source": [ + "print(cat._meta_cats[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e9180c4c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"Category Name\": \"Presence\",\n", + " \"Description\": \"No description\",\n", + " \"Classes\": {\n", + " \"Hypothetical (N/A)\": 1,\n", + " \"Not present (False)\": 0,\n", + " \"Present (True)\": 2\n", + " },\n", + " \"Model\": \"bert\"\n", + "}\n" + ] + } + ], + "source": [ + "print(cat._meta_cats[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "275ca9ff", + "metadata": {}, + "outputs": [ + { 
+ "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"Category Name\": \"Experiencer\",\n", + " \"Description\": \"No description\",\n", + " \"Classes\": {\n", + " \"Family\": 1,\n", + " \"Other\": 0,\n", + " \"Patient\": 2\n", + " },\n", + " \"Model\": \"bert\"\n", + "}\n" + ] + } + ], + "source": [ + "print(cat._meta_cats[2])" + ] + }, + { + "cell_type": "markdown", + "id": "3047b1d9", + "metadata": {}, + "source": [ + " NOTE: \n", + " The name for the classification task can vary. E.g: The Category Name for 'Experiencer' can be 'Subject', as it has been configured and annotated in MedCATTrainer this way, but the model expects 'Experiencer'\n", + " \n", + " To accommodate this, we have a list that stores the variations for the alternate names. This attribute can be found under `mc.config.general.alternative_category_names`\n", "\n", - "# Initialise meta_ann models\n", - "if model_pack_path[-4:] == '.zip':\n", - " base_dir_meta_models = model_pack_path[:-4]\n", - "else:\n", - " base_dir_meta_models = model_pack_path\n", + "E.g. for Experiencer, it will be pre-loaded as alternative_category_names = ['Experiencer','Subject']\n", "\n", - "# Iterate through the meta_models contained in the model\n", - "meta_model_names = [] # These Meta_annotation tasks should correspond to the ones labelled in the mcttrainer export\n", - "for dirpath, dirnames, filenames in os.walk(base_dir_meta_models):\n", - " for dirname in dirnames:\n", - " if dirname.startswith('meta_'):\n", - " meta_model_names.append(dirname[5:])" + "Set this list to ensure during training / fine-tuning the model is aware of alternative names for classes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ca00fb0", + "metadata": {}, + "outputs": [], + "source": [ + "print(cat._meta_cats[0].config.general.alternative_category_names)" ] }, { "cell_type": "markdown", - "id": "35aa5605", + "id": "5dba296c", "metadata": {}, "source": [ - "Before you run the next section please double check that the model meta_annotation names matches to those specified in the mct export.\n", - "\n" + "💡 In case you are using older modelpacks, the above field will be empty. In that case, please run the following code:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92e41964", + "metadata": {}, + "outputs": [], + "source": [ + "# Only run in case the above output is an empty list\n", + "category_name_mapping = [[\"Presence\"],[\"Temporality\",\"Time\"],[\"Experiencer\",\"Subject\"]]\n", + "lookup = {item: group for group in category_name_mapping for item in group}\n", + "\n", + "for meta_model in range(len(cat._meta_cats)):\n", + " cat._meta_cats[meta_model].config.general.alternative_category_names = lookup.get(cat._meta_cats[meta_model].config.general.category_name)" ] }, { "cell_type": "markdown", - "id": "8bf6f5c3", + "id": "12e91f77", "metadata": {}, "source": [ - "Depending on the model pack you have, please run the LSTM model or BERT model section.
\n", - "If you are unsure, use this section to check the model type." + " NOTE: \n", + " The name for the classes can vary too. Some sites may have trained a MetaCAT model for the same task, but called a class value a slightly different name.\n", + " \n", + " E.g: For the Presence task, the class name can be 'Not present (False)' or 'False'\n", + " \n", + " To accommodate this, we have a mapping that stores the variations for the alternate names. This attribute can be found under `mc.config.general.alternative_class_names`\n", + "\n", + " E.g. for Presence, it will be pre-loaded as alternative_class_names = [[\"Hypothetical (N/A)\",\"Hypothetical\"],[\"Not present (False)\",\"False\"],[\"Present (True)\",\"True\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f6b06e2", + "metadata": {}, + "outputs": [], + "source": [ + "print(cat._meta_cats[0].config.general.alternative_class_names)" + ] + }, + { + "cell_type": "markdown", + "id": "3c97c986", + "metadata": {}, + "source": [ + "💡 In case you are using older modelpacks, the above field will be empty. 
In that case, please run the following code:" ] }, { "cell_type": "code", "execution_count": null, - "id": "2933f7e1", + "id": "0fdfae70", "metadata": {}, "outputs": [], "source": [ - "for meta_model in meta_model_names:\n", - " config_file = os.path.join(base_dir_meta_models,\"meta_\"+meta_model,\"config.json\")\n", - " with open(config_file, 'r') as jfile:\n", - " config_dict = json.load(jfile)\n", - " print(f\"Model used for meta_{meta_model}:\",config_dict['model']['model_name'])" + "# Only run in case the above output is an empty list\n", + "class_name_mapping = {\n", + " \"Temporality\": [[\"Past\"], [\"Recent\", \"Present\"], [\"Future\"]],\n", + " \"Time\": [[\"Past\"], [\"Recent\", \"Present\"], [\"Future\"]],\n", + " \"Experiencer\": [[\"Family\"], [\"Other\"], [\"Patient\"]],\n", + " \"Subject\": [[\"Family\"], [\"Other\"], [\"Patient\"]],\n", + " \"Presence\": [[\"Hypothetical (N/A)\", \"Hypothetical\"], [\"Not present (False)\", \"False\"], [\"Present (True)\", \"True\"]]\n", + "}\n", + "\n", + "for meta_model in range(len(cat._meta_cats)):\n", + " cat._meta_cats[meta_model].config.general.alternative_class_names = class_name_mapping[cat._meta_cats[meta_model].config.general.category_name]" ] }, { @@ -124,22 +275,22 @@ "metadata": {}, "outputs": [], "source": [ - "for meta_model in meta_model_names:\n", - " \n", - " # load the meta_model\n", - " mc = MetaCAT.load(save_dir_path=os.path.join(base_dir_meta_models,\"meta_\"+meta_model))\n", + "# Train the first meta cat model - 'Temporality' Task.\n", + "meta_cat = cat._meta_cats[0]\n", "\n", - " # changing parameters\n", - " mc.config.train['nepochs'] = 15\n", + "# to overwrite the existing model, resave the fine-tuned model with the same model pack dir\n", + "meta_cat_task = meta_cat.config.general.category_name\n", + "model_pack_dir = ''\n", + "save_dir_path = os.path.join(model_pack_dir,\"meta_\"+ meta_cat_task)\n", "\n", - " save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and 
results. \n", - " #Ideally this should replace the meta_models inside the modelpack\n", + "# to save the new model elsewhere, uncomment the below line\n", + "#save_dir_path= \"test_meta_\"+meta_cat_task # Where to save the meta_model and results. \n", "\n", - " # train the meta_model\n", - " results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n", - " \n", - " # Save results\n", - " json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results.json'), 'w'))" + "# train the meta_model\n", + "results = meta_cat.train_from_json(mctrainer_export, save_dir_path=save_dir_path)\n", + "\n", + "# Save results\n", + "json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_cat_task+'_results.json'), 'w'))" ] }, { @@ -147,7 +298,8 @@ "id": "ab23e424", "metadata": {}, "source": [ - "## If you dont have the model packs, and are training from scratch" + "## If you dont have the model packs, and are training from scratch\n", + "⚠️This is very rare, it is recommended to always use the model packs and then fine-tune them" ] }, { @@ -167,23 +319,22 @@ "\n", "tokenizer = TokenizerWrapperBERT.load(\"\", config.model['model_variant'])\n", "\n", - "save_dir_path= \"test_meta\" # Where to save the meta_model and results. \n", - "#Ideally this should replace the meta_models inside the modelpack\n", + "save_dir_path= \"test_meta_\" + meta_cat_task # Where to save the meta_model and results. 
\n", "\n", "# Initialise and train meta_model\n", "mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)\n", - "results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n", + "results = mc.train_from_json(mctrainer_export, save_dir_path=save_dir_path)\n", "\n", "# Save results\n", - "json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results.json'), 'w'))" + "json.dump(results['report'], open(os.path.join(save_dir_path,'meta_' + meta_cat_task+'_results.json'), 'w'))" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python [conda env:cattrainer]", "language": "python", - "name": "python3" + "name": "conda-env-cattrainer-py" }, "language_info": { "codemirror_mode": { @@ -195,7 +346,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.8" + "version": "3.11.11" } }, "nbformat": 4, diff --git a/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb b/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb index 7266056..d32d399 100644 --- a/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb +++ b/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb @@ -1,13 +1,5 @@ { "cells": [ - { - "cell_type": "markdown", - "id": "ae1fe3b4", - "metadata": {}, - "source": [ - "### This notebook is an advanced tutorial detailing the config changes for optimising the BERT and LSTM models for Experiencer classification task on custom dataset" - ] - }, { "cell_type": "code", "execution_count": 1, @@ -21,8 +13,7 @@ "from medcat.cat import CAT\n", "from medcat.meta_cat import MetaCAT\n", "from medcat.config_meta_cat import ConfigMetaCAT\n", - "from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBPE, TokenizerWrapperBERT\n", - "from tokenizers import ByteLevelBPETokenizer" + "from 
medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBERT" ] }, { @@ -34,125 +25,91 @@ "source": [ "# if you want to enable info level logging\n", "import logging\n", - "logging.basicConfig(level=logging.INFO,force=True)\n", - "logger = logging.getLogger(__name__)" + "logging.basicConfig(level=logging.INFO,force=True)" ] }, { "cell_type": "markdown", - "id": "5d0606ec", + "id": "b1c5b9b0", "metadata": {}, "source": [ - "# Set parameters" + "#### 💡 To understand the model loading and other functionalities, please refer to the 'meta_annotation_training.ipynb' notebook" ] }, { "cell_type": "code", "execution_count": 3, - "id": "dd7a2e97", + "id": "a2c0431f", "metadata": {}, "outputs": [], "source": [ - "# relative path to working_with_cogstack folder\n", - "_rel_path = os.path.join(\"..\", \"..\", \"..\")\n", - "# absolute path to working_with_cogstack folder\n", - "base_path = os.path.abspath(_rel_path)\n", - "# Load mct export\n", - "ann_dir = os.path.join(base_path, \"data\", \"medcattrainer_export\")\n", - "\n", - "mctrainer_export_path = ann_dir + \"\" # name of your mct export\n", - "\n", - "# Load model\n", - "model_dir = os.path.join(base_path, \"models\", \"modelpack\")\n", - "modelpack = '' # name of modelpack\n", - "model_pack_path = os.path.join(model_dir, modelpack)\n", - " #output_modelpack = model_dir + f\"{today}_trained_model\"\n", - "\n", - "# will be used to date the trained model\n", - "today = str(date.today())\n", - "today = today.replace(\"-\",\"\")\n", - "\n", - "# Initialise meta_ann models\n", - "if model_pack_path[-4:] == '.zip':\n", - " base_dir_meta_models = model_pack_path[:-4]\n", - "else:\n", - " base_dir_meta_models = model_pack_path\n", - "\n", - "# Iterate through the meta_models contained in the model\n", - "meta_model_names = [] # These Meta_annotation tasks should correspond to the ones labelled in the mcttrainer export\n", - "for dirpath, dirnames, filenames in os.walk(base_dir_meta_models):\n", - " for dirname in 
dirnames:\n", - " if dirname.startswith('meta_'):\n", - " meta_model_names.append(dirname[5:])" + "model_pack = '' # path to the unzipped model pack directory (not the .zip)\n", + "mctrainer_export = \"\" # name of your mct export" ] }, { "cell_type": "markdown", - "id": "35aa5605", + "id": "808c27c1", "metadata": {}, "source": [ - "Before you run the next section please double check that the model meta_annotation names matches to those specified in the mct export.\n", - "\n" + "We won't load the models at this stage as they need to be separately loaded later.
Let's check for meta models in the directory" ] }, { - "cell_type": "markdown", - "id": "699be74b", + "cell_type": "code", + "execution_count": 4, + "id": "675eab49", "metadata": {}, + "outputs": [], "source": [ - "# Class weights " + "# Iterate through the meta_models contained in the model\n", + "meta_model_names = []\n", + "for dirpath, dirnames, filenames in os.walk(model_pack):\n", + " for dirname in dirnames:\n", + " if dirname.startswith('meta_'):\n", + " meta_model_names.append(dirname[5:])\n", + "\n", + "print(\"Meta models:\",meta_model_names)" ] }, { "cell_type": "markdown", - "id": "e624d876", + "id": "9e499198", "metadata": {}, "source": [ + "# Class weights \n", + "\n", "Adjusting class weights to give more importance to specific classes. Generally, class weights are used in favour of minority classes(classes with less number of samples) to boost their performance.\n", "

To use class weights, we have 2 options:\n", "
1. calculate class weights based on class distribution\n", - "
2. using specified class weights" - ] - }, - { - "cell_type": "markdown", - "id": "dc91f7d6", - "metadata": {}, - "source": [ + "
2. using specified class weights\n", "\n", - "#option 1
\n", - "mc.config.train['class_weights'] = []
\n", - "mc.config.train['compute_class_weights'] = True
\n", - "#NOTE: this will only be applicable if mc.config.train.class_weights is empty
\n", + "\n", + "#option 1
\n", + "metacat.config.train['class_weights'] = []
\n", + "metacat.config.train['compute_class_weights'] = True
\n", "
\n", - "#2nd option
\n", - "#using specified class weights
\n", - "mc.config.train['class_weights'] = [0.4,0.3,0.1]
" + "#option 2
\n", + "metacat.config.train['class_weights'] = [0.4,0.3,0.1]
" ] }, { "cell_type": "markdown", - "id": "c217762f", + "id": "fc07f3e9", "metadata": {}, "source": [ - "NOTE: Make sure to correctly map the class weights to their corresponding class index (ID).
To check the index assigned to the classes, use:
`print(mc.config.general['category_value2id'])`\n", + "NOTE: Make sure to correctly map the class weights to their corresponding class index.
To check the index assigned to the classes, use:
`print(mc.config.general['category_value2id'])`\n", "
This will print a dictionary where the class names and their corresponding IDs (indices) are displayed.
\n", "The first position in the class weight list corresponds to the class with ID 0 in the dictionary, and so on." ] }, { "cell_type": "markdown", - "id": "c3002ef0", - "metadata": {}, - "source": [ - "# 2 phase learning for training" - ] - }, - { - "cell_type": "markdown", - "id": "a349af2b", + "id": "6a92aa60", "metadata": {}, "source": [ + "# 2 phase learning for training\n", + "\n", "2 phase learning is used to mitigate class imbalance. In 2 phase learning, the models are trained twice:
\n", "Phase 1: trains for minority class(es) by undersampling data so that there is no class imbalance\n", "
Phase 2: trains for all classes\n", @@ -161,86 +118,87 @@ "
Phase 2 is when the model is expected to learn the majority class as it is trained on the entire dataset.\n", "\n", "Paper reference - https://ieeexplore.ieee.org/document/7533053\n", - "
NOTE: Make sure to use class weights in favour of minority classes with 2 phase learning" + "
Make sure to use class weights in favour of minority classes with 2 phase learning" ] }, { "cell_type": "code", - "execution_count": null, - "id": "8ff613ef", + "execution_count": 5, + "id": "5a86b839", "metadata": {}, "outputs": [], "source": [ "#--------------------------------Phase 1--------------------------------\n", "def run_phase_1(meta_model,class_wt_phase1 = None):\n", " #Loading the pre-defined config for phase 1\n", - " config_ph_1_path = os.path.join(base_dir_meta_models,\"meta_\"+meta_model,\"config_ph1.json\")\n", + " config_ph_1_path = os.path.join(model_pack,\"meta_\"+meta_model,\"config_ph1.json\")\n", " with open(config_ph_1_path) as f:\n", " config_ph1 = json.load(f)\n", - "\n", - " mc = MetaCAT.load(save_dir_path=os.path.join(base_dir_meta_models,\"meta_\"+meta_model),config_dict = config_ph1)\n", + " mc = MetaCAT.load(save_dir_path=os.path.join(model_pack,\"meta_\"+meta_model),config_dict = config_ph1)\n", "\n", " if class_wt_phase1:\n", " mc.config.train['class_weights'] = class_wt_phase1\n", "\n", - " mc.config.train['nepochs'] = 30 #You can change the number of epochs, remember to keep them higher for phase 1\n", + " #You can change the number of epochs, remember to keep them higher for phase 1\n", + " mc.config.train['nepochs'] = 40 \n", "\n", - " save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. 
\n", - " results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n", + " results = mc.train_from_json(mctrainer_export, save_dir_path=save_dir_path)\n", " # Save results\n", " json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results_phase1.json'), 'w'))\n", "\n", "#--------------------------------Phase 2--------------------------------\n", "def run_phase_2(meta_model,class_wt_phase2 = None): \n", " #Loading the pre-defined config for phase 2\n", - " config_ph_2_path = os.path.join(base_dir_meta_models,\"meta_\"+meta_model,\"config_ph2.json\")\n", + " config_ph_2_path = os.path.join(model_pack,\"meta_\"+meta_model,\"config_ph2.json\")\n", " with open(config_ph_2_path) as f:\n", " config_ph2 = json.load(f)\n", "\n", - " mc = MetaCAT.load(save_dir_path=os.path.join(base_dir_meta_models,\"meta_\"+meta_model),config_dict = config_ph2)\n", + " mc = MetaCAT.load(save_dir_path=os.path.join(model_pack,\"meta_\"+meta_model),config_dict = config_ph2)\n", "\n", " if class_wt_phase2:\n", " mc.config.train['class_weights'] = class_wt_phase2\n", "\n", - " mc.config.train['nepochs'] = 15\n", + " #You can change the number of epochs\n", + " mc.config.train['nepochs'] = 20\n", "\n", - " save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. 
Ensure to keep this same as Phase 1\n", - " results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n", + " results = mc.train_from_json(mctrainer_export, save_dir_path=save_dir_path)\n", " # Save results\n", " json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results_phase2.json'), 'w'))\n", "\n", "#--------------------------------Driver--------------------------------\n", - "for meta_model in meta_model_names:\n", - " #To use your own class weights instead of the pre-defined ones for the 2 phases, uncomment the below lines\n", - " '''class_wt_phase1 = []\n", - " class_wt_phase2 = []'''\n", + "# Train the first meta cat model\n", + "meta_model = meta_model_names[0]\n", "\n", - " # Train 2 phase learning\n", - " logger.info(\"\\n********************Beginning Phase 1********************\")\n", - " run_phase_1(meta_model,class_wt_phase1)\n", - " logger.info(\"\\n********************Beginning Phase 2********************\")\n", - " run_phase_2(meta_model,class_wt_phase2)" - ] - }, - { - "cell_type": "markdown", - "id": "b3d43a3b", - "metadata": {}, - "source": [ - "# Oversampling data" + "# to overwrite the existing model, resave the fine-tuned model with the same model pack dir\n", + "meta_cat_task = meta_model\n", + "save_dir_path = os.path.join(model_pack,\"meta_\"+ meta_cat_task)\n", + "\n", + "# To use your own class weights instead of the pre-defined ones for the 2 phases, put the weights in the lists below\n", + "class_wt_phase1 = [] # Example [0.4,0.4,0.2]\n", + "class_wt_phase2 = [] # Example [0.4,0.3,0.3]\n", + "\n", + "\n", + "# Train 2 phase learning\n", + "print(\"*** Training meta cat: \",meta_model)\n", + "print(\"Beginning Phase 1...\")\n", + "run_phase_1(meta_model,class_wt_phase1)\n", + "print(\"Beginning Phase 2...\")\n", + "run_phase_2(meta_model,class_wt_phase2)" ] }, { "cell_type": "markdown", - "id": "ca9b70b3", + "id": "60f0e878", "metadata": {}, "source": [ + "# Generating synthetic 
data\n", + "\n", "You can generate synthetic data to help mitigate class imbalance.
Use this code to generate synthetic data using LLM - [link](https://gist.github.com/shubham-s-agarwal/401ef8bf6cbbd66fa0c76a8fbfc1f6c4)
NOTE: the generated data will require manual quality check to ensure that high quality and relevant data is used for training. " ] }, { "cell_type": "markdown", - "id": "5835eb2b", + "id": "431e1002", "metadata": {}, "source": [ "The data generated from the gist code and the format of the data required by MedCAT are different, requiring manual formatting at the moment. We will update this module to include the code to handle the same." @@ -249,22 +207,19 @@ { "cell_type": "code", "execution_count": null, - "id": "8161b602", + "id": "4d07d437", "metadata": {}, "outputs": [], "source": [ "# To run the training with original + synthetic data\n", - "# Follow all the same steps till initializing the metacat model\n", - "\n", - "# Initialise and train meta_model\n", - "mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)\n", + "# Follow all the same steps till and load the model\n", "\n", "# the format expected is [[['text','of','the','document'], [index of medical entity], \"label\" ],\n", "# ['text','of','the','document'], [index of medical entity], \"label\" ]]\n", "\n", "synthetic_data_export = [[],[],[]]\n", "\n", - "results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path,data_oversampled=synthetic_data_export)\n", + "results = mc.train_from_json(mctrainer_export, save_dir_path=save_dir_path,data_oversampled=synthetic_data_export)\n", "\n", "# Save results\n", "json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results.json'), 'w'))" @@ -273,7 +228,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "pytorch_medcat_clean", "language": "python", "name": "python3" }, @@ -287,7 +242,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.8" + "version": "3.10.14" } }, "nbformat": 4, diff --git a/medcat/compare_models/compare_annotations.py b/medcat/compare_models/compare_annotations.py index 768bbb6..af7e252 100644 --- 
a/medcat/compare_models/compare_annotations.py +++ b/medcat/compare_models/compare_annotations.py @@ -14,7 +14,7 @@ class ResultsTally(BaseModel): pt2ch: Optional[Dict[str, Set[str]]] cat_data: dict cui2name: Callable[[str], str] - total_count = 0 + total_count: int = 0 per_cui_count: Dict[str, int] = {} per_cui_acc: Dict[str, float] = {} per_cui_forms: Dict[str, Set[str]] = {} diff --git a/medcat/compare_models/tests/test_compare_annotations.py b/medcat/compare_models/tests/test_compare_annotations.py index 55f19e1..b2b6fa5 100644 --- a/medcat/compare_models/tests/test_compare_annotations.py +++ b/medcat/compare_models/tests/test_compare_annotations.py @@ -39,7 +39,7 @@ def _cui2name(self, cui: str) -> str: def setUp(self) -> None: self.res = compare_annotations.ResultsTally(cat_data={"stats": "don't matter"}, - cui2name=self._cui2name) + cui2name=self._cui2name, pt2ch=None) for entities in self.entities: self.res.count(entities['entities']) diff --git a/requirements.txt b/requirements.txt index a14b289..f476450 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ -spacy<3.8.0 -medcat~=1.12.0 +spacy>=3.6.0,<4.0 +medcat~=1.16.0 plotly~=5.19.0 eland==8.12.1 -en_core_web_md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl +en_core_web_md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl ipyfilechooser jupyter_contrib_nbextensions