Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ on:
jobs:
native-py:

runs-on: ubuntu-20.04
runs-on: ubuntu-24.04
strategy:
matrix:
python-version: [ '3.8', '3.9', '3.10', '3.11' ]
python-version: [ '3.9', '3.10', '3.11', '3.12' ]
max-parallel: 4

steps:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,7 @@
"from medcat.cat import CAT\n",
"from medcat.meta_cat import MetaCAT\n",
"from medcat.config_meta_cat import ConfigMetaCAT\n",
"from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBPE, TokenizerWrapperBERT\n",
"from tokenizers import ByteLevelBPETokenizer"
"from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBERT"
]
},
{
Expand All @@ -31,82 +30,234 @@
},
{
"cell_type": "markdown",
"id": "5d0606ec",
"id": "f310cef3",
"metadata": {},
"source": [
"# Set parameters"
"### Load the model pack with MetaCATs\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"id": "dd7a2e97",
"metadata": {},
"outputs": [],
"source": [
"# relative path to working_with_cogstack folder\n",
"_rel_path = os.path.join(\"..\", \"..\", \"..\")\n",
"# absolute path to working_with_cogstack folder\n",
"base_path = os.path.abspath(_rel_path)\n",
"# Load mct export\n",
"ann_dir = os.path.join(base_path, \"data\", \"medcattrainer_export\")\n",
"\n",
"mctrainer_export_path = ann_dir + \"\" # name of your mct export\n",
"\n",
"model_pack = '<enter path to the model pack>' # .zip model pack location \n",
"mctrainer_export = \"<enter mct export location>\" # name of your mct export"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "921d5e9e",
"metadata": {},
"outputs": [],
"source": [
"# Load model\n",
"model_dir = os.path.join(base_path, \"models\", \"modelpack\")\n",
"modelpack = '' # name of modelpack\n",
"model_pack_path = os.path.join(model_dir, modelpack)\n",
" #output_modelpack = model_dir + f\"{today}_trained_model\"\n",
"cat = CAT.load_model_pack(model_pack)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "b205d51b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"There are: 3 meta cat models in this model pack.\n"
]
}
],
"source": [
"\n",
"# will be used to date the trained model\n",
"today = str(date.today())\n",
"today = today.replace(\"-\",\"\")\n",
"# Check what meta cat models are in this model pack.\n",
"print(f'There are: {len(cat._meta_cats)} meta cat models in this model pack.')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "31d7632a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"Category Name\": \"Temporality\",\n",
" \"Description\": \"No description\",\n",
" \"Classes\": {\n",
" \"Past\": 0,\n",
" \"Recent\": 1,\n",
" \"Future\": 2\n",
" },\n",
" \"Model\": \"bert\"\n",
"}\n"
]
}
],
"source": [
"print(cat._meta_cats[0])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e9180c4c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"Category Name\": \"Presence\",\n",
" \"Description\": \"No description\",\n",
" \"Classes\": {\n",
" \"Hypothetical (N/A)\": 1,\n",
" \"Not present (False)\": 0,\n",
" \"Present (True)\": 2\n",
" },\n",
" \"Model\": \"bert\"\n",
"}\n"
]
}
],
"source": [
"print(cat._meta_cats[1])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "275ca9ff",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"Category Name\": \"Experiencer\",\n",
" \"Description\": \"No description\",\n",
" \"Classes\": {\n",
" \"Family\": 1,\n",
" \"Other\": 0,\n",
" \"Patient\": 2\n",
" },\n",
" \"Model\": \"bert\"\n",
"}\n"
]
}
],
"source": [
"print(cat._meta_cats[2])"
]
},
{
"cell_type": "markdown",
"id": "3047b1d9",
"metadata": {},
"source": [
"<b> NOTE: </b> \n",
" The name for the classification task can vary. E.g. the Category Name for 'Experiencer' can be 'Subject' if it was configured and annotated that way in MedCATTrainer, while the model expects 'Experiencer'.\n",
" \n",
" To accommodate this, we have a list that stores the alternative names. This attribute can be found under `mc.config.general.alternative_category_names`\n",
"\n",
"# Initialise meta_ann models\n",
"if model_pack_path[-4:] == '.zip':\n",
" base_dir_meta_models = model_pack_path[:-4]\n",
"else:\n",
" base_dir_meta_models = model_pack_path\n",
"E.g. for Experiencer, it will be pre-loaded as alternative_category_names = ['Experiencer','Subject']\n",
"\n",
"# Iterate through the meta_models contained in the model\n",
"meta_model_names = [] # These Meta_annotation tasks should correspond to the ones labelled in the mcttrainer export\n",
"for dirpath, dirnames, filenames in os.walk(base_dir_meta_models):\n",
" for dirname in dirnames:\n",
" if dirname.startswith('meta_'):\n",
" meta_model_names.append(dirname[5:])"
"Set this list to ensure during training / fine-tuning the model is aware of alternative names for classes."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1ca00fb0",
"metadata": {},
"outputs": [],
"source": [
"print(cat._meta_cats[0].config.general.alternative_category_names)"
]
},
{
"cell_type": "markdown",
"id": "35aa5605",
"id": "5dba296c",
"metadata": {},
"source": [
"Before you run the next section please double check that the model meta_annotation names matches to those specified in the mct export.\n",
"\n"
"💡 In case you are using older modelpacks, the above field will be empty. In that case, please run the following code:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "92e41964",
"metadata": {},
"outputs": [],
"source": [
"# Only run in case the above output is an empty list.\n",
"# Each inner list groups the names that refer to the same meta-annotation task.\n",
"category_name_mapping = [[\"Presence\"],[\"Temporality\",\"Time\"],[\"Experiencer\",\"Subject\"]]\n",
"# Map every known name to the full group of names it belongs to.\n",
"lookup = {item: group for group in category_name_mapping for item in group}\n",
"\n",
"for meta_cat_model in cat._meta_cats:\n",
"    general_config = meta_cat_model.config.general\n",
"    # Fall back to the task's own name so an unlisted category is never set to None.\n",
"    general_config.alternative_category_names = lookup.get(\n",
"        general_config.category_name, [general_config.category_name]\n",
"    )"
]
},
{
"cell_type": "markdown",
"id": "8bf6f5c3",
"id": "12e91f77",
"metadata": {},
"source": [
"Depending on the model pack you have, please run the LSTM model or BERT model section. <br>\n",
"If you are unsure, use this section to check the model type."
"<b> NOTE: </b> \n",
" The name for the classes can vary too. Some sites may have trained a MetaCAT model for the same task, but called a class value a slightly different name.\n",
" \n",
" E.g: For the Presence task, the class name can be 'Not present (False)' or 'False'\n",
" \n",
" To accommodate this, we have a mapping that stores the alternative names. This attribute can be found under `mc.config.general.alternative_class_names`\n",
"\n",
" E.g. for Presence, it will be pre-loaded as alternative_class_names = [[\"Hypothetical (N/A)\",\"Hypothetical\"],[\"Not present (False)\",\"False\"],[\"Present (True)\",\"True\"]]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5f6b06e2",
"metadata": {},
"outputs": [],
"source": [
"print(cat._meta_cats[0].config.general.alternative_class_names)"
]
},
{
"cell_type": "markdown",
"id": "3c97c986",
"metadata": {},
"source": [
"💡 In case you are using older modelpacks, the above field will be empty. In that case, please run the following code:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2933f7e1",
"id": "0fdfae70",
"metadata": {},
"outputs": [],
"source": [
"for meta_model in meta_model_names:\n",
" config_file = os.path.join(base_dir_meta_models,\"meta_\"+meta_model,\"config.json\")\n",
" with open(config_file, 'r') as jfile:\n",
" config_dict = json.load(jfile)\n",
" print(f\"Model used for meta_{meta_model}:\",config_dict['model']['model_name'])"
"# Only run in case the above output is an empty list.\n",
"# For every task name, each inner list groups the labels that denote the same class.\n",
"class_name_mapping = {\n",
"    \"Temporality\": [[\"Past\"], [\"Recent\", \"Present\"], [\"Future\"]],\n",
"    \"Time\": [[\"Past\"], [\"Recent\", \"Present\"], [\"Future\"]],\n",
"    \"Experiencer\": [[\"Family\"], [\"Other\"], [\"Patient\"]],\n",
"    \"Subject\": [[\"Family\"], [\"Other\"], [\"Patient\"]],\n",
"    \"Presence\": [[\"Hypothetical (N/A)\", \"Hypothetical\"], [\"Not present (False)\", \"False\"], [\"Present (True)\", \"True\"]]\n",
"}\n",
"\n",
"for meta_cat_model in cat._meta_cats:\n",
"    general_config = meta_cat_model.config.general\n",
"    if general_config.category_name in class_name_mapping:\n",
"        general_config.alternative_class_names = class_name_mapping[general_config.category_name]\n",
"    else:\n",
"        # Leave unknown tasks untouched instead of stopping the loop with a KeyError.\n",
"        print(f\"No alternative class names defined for '{general_config.category_name}'; add an entry to class_name_mapping.\")"
]
},
{
Expand All @@ -124,30 +275,31 @@
"metadata": {},
"outputs": [],
"source": [
"for meta_model in meta_model_names:\n",
" \n",
" # load the meta_model\n",
" mc = MetaCAT.load(save_dir_path=os.path.join(base_dir_meta_models,\"meta_\"+meta_model))\n",
"# Train the first meta cat model - 'Temporality' Task.\n",
"meta_cat = cat._meta_cats[0]\n",
"\n",
" # changing parameters\n",
" mc.config.train['nepochs'] = 15\n",
"# to overwrite the existing model, resave the fine-tuned model with the same model pack dir\n",
"meta_cat_task = meta_cat.config.general.category_name\n",
"model_pack_dir = '<enter path to meta model pack>'\n",
"save_dir_path = os.path.join(model_pack_dir,\"meta_\"+ meta_cat_task)\n",
"\n",
" save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. \n",
" #Ideally this should replace the meta_models inside the modelpack\n",
"# to save the new model elsewhere, uncomment the below line\n",
"#save_dir_path= \"test_meta_\"+meta_cat_task # Where to save the meta_model and results. \n",
"\n",
" # train the meta_model\n",
" results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n",
" \n",
" # Save results\n",
" json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results.json'), 'w'))"
"# train the meta_model\n",
"results = meta_cat.train_from_json(mctrainer_export, save_dir_path=save_dir_path)\n",
"\n",
"# Save results; the context manager closes (and flushes) the report file\n",
"results_path = os.path.join(save_dir_path, 'meta_' + meta_cat_task + '_results.json')\n",
"with open(results_path, 'w') as results_file:\n",
"    json.dump(results['report'], results_file)"
]
},
{
"cell_type": "markdown",
"id": "ab23e424",
"metadata": {},
"source": [
"## If you dont have the model packs, and are training from scratch"
"## If you don't have the model packs, and are training from scratch\n",
"<b>⚠️ This is very rare; it is recommended to always use the model packs and then fine-tune them.</b>"
]
},
{
Expand All @@ -167,23 +319,22 @@
"\n",
"tokenizer = TokenizerWrapperBERT.load(\"\", config.model['model_variant'])\n",
"\n",
"save_dir_path= \"test_meta\" # Where to save the meta_model and results. \n",
"#Ideally this should replace the meta_models inside the modelpack\n",
"save_dir_path= \"test_meta_\" + meta_cat_task # Where to save the meta_model and results. \n",
"\n",
"# Initialise and train meta_model\n",
"mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)\n",
"results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n",
"results = mc.train_from_json(mctrainer_export, save_dir_path=save_dir_path)\n",
"\n",
"# Save results\n",
"json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results.json'), 'w'))"
"# The context manager closes (and flushes) the report file once it is written\n",
"results_path = os.path.join(save_dir_path, 'meta_' + meta_cat_task + '_results.json')\n",
"with open(results_path, 'w') as results_file:\n",
"    json.dump(results['report'], results_file)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python [conda env:cattrainer]",
"language": "python",
"name": "python3"
"name": "conda-env-cattrainer-py"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -195,7 +346,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
"version": "3.11.11"
}
},
"nbformat": 4,
Expand Down
Loading