Issue #62: added 'script' attribute to 'system' stanza in JSON export file format

amesar · amesar · commit 8ce46e988222 · 2022-12-14T21:13:35.000-05:00
diff --git a/mlflow_export_import/bulk/export_all.py b/mlflow_export_import/bulk/export_all.py
@@ -32,14 +32,14 @@ def export_all(output_dir, notebook_formats=None, use_threads=False):
         use_threads=use_threads)
     duration = round(time.time() - start_time, 1)
 
-    content = {
+    mlflow_attr = {
         "summary": {
             "stages": ALL_STAGES,
             "notebook_formats": notebook_formats,
             "duration": duration
         }
     }
-    io_utils.write_export_file(output_dir, "all_manifest.json", content)
+    io_utils.write_export_file(output_dir, "all_manifest.json", __file__, mlflow_attr)
     print(f"Duraton for entire tracking server export: {duration} seconds")
 
 
diff --git a/mlflow_export_import/bulk/export_experiments.py b/mlflow_export_import/bulk/export_experiments.py
@@ -85,15 +85,15 @@ def export_experiments(client, experiments, output_dir, notebook_formats=None, u
     total_runs = ok_runs + failed_runs
     duration = round(time.time() - start_time, 1)
 
-    custom_info = {
+    info_attr = {
       "duration": duration,
       "experiments": len(experiments),
       "total_runs": total_runs,
       "ok_runs": ok_runs,
       "failed_runs": failed_runs
     }
-    content = { "experiments": export_results }
-    io_utils.write_export_file(output_dir, "experiments.json", content, custom_info)
+    mlflow_attr = { "experiments": export_results }
+    io_utils.write_export_file(output_dir, "experiments.json", __file__, mlflow_attr, info_attr)
 
     print(f"{len(experiments)} experiments exported")
     print(f"{ok_runs}/{total_runs} runs succesfully exported")
diff --git a/mlflow_export_import/bulk/export_models.py b/mlflow_export_import/bulk/export_models.py
@@ -41,7 +41,7 @@ def _export_models(client, model_names, output_dir, notebook_formats, stages, ex
         else: failed_models.append(result[1])
     duration = round(time.time()-start_time, 1)
 
-    custom_info = {
+    info_attr = {
         "stages": stages,
         "notebook_formats": notebook_formats,
         "num_total_models": len(model_names),
@@ -50,10 +50,10 @@ def _export_models(client, model_names, output_dir, notebook_formats, stages, ex
         "duration": duration,
         "failed_models": failed_models
     }
-    content = {
+    mlflow_attr = {
         "models": ok_models,
     }
-    io_utils.write_export_file(output_dir, "models.json", content, custom_info)
+    io_utils.write_export_file(output_dir, "models.json", __file__, mlflow_attr, info_attr)
 
     print(f"{len(model_names)} models exported")
     print(f"Duration for registered models export: {duration} seconds")
@@ -69,11 +69,11 @@ def export_models(client, model_names, output_dir, notebook_formats=None, stages
     _export_models(client, model_names, os.path.join(output_dir,"models"), notebook_formats, stages, export_run=False, use_threads=use_threads)
     duration = round(time.time()-start_time, 1)
 
-    custom_info = {
+    info_attr = {
       "stages": stages, 
       "notebook_formats": notebook_formats
     }
-    io_utils.write_export_file(output_dir, "models.json", {}, custom_info)
+    io_utils.write_export_file(output_dir, "models.json", __file__, {}, info_attr)
 
     print(f"Duration for total registered models and versions' runs export: {duration} seconds")
 
diff --git a/mlflow_export_import/common/io_utils.py b/mlflow_export_import/common/io_utils.py
@@ -6,14 +6,15 @@
 from mlflow_export_import.common.source_tags import ExportFields
 
 
-def _mk_export_info():
+def _mk_system_attr(script):
     """
     Create common standard JSON stanza containing internal export information.
     """
     import mlflow
     import platform
     return {
         ExportFields.SYSTEM: {
+            "script": os.path.basename(script),
             "export_time": ts_now_seconds,
             "_export_time": ts_now_fmt_utc,
             "mlflow_version": mlflow.__version__,
@@ -27,16 +28,16 @@ def _mk_export_info():
     }
 
 
-def write_export_file(dir, file, content, custom_info=None):
+def write_export_file(dir, file, script, mlflow_attr, info_attr=None):
     """
     Write standard formatted JSON file.
     """
     path = os.path.join(dir, file)
-    custom_info = { ExportFields.INFO: custom_info} if custom_info else {}
-    content = { ExportFields.MLFLOW: content}
-    content = { **_mk_export_info(), **custom_info, **content }
+    info_attr = { ExportFields.INFO: info_attr} if info_attr else {}
+    mlflow_attr = { ExportFields.MLFLOW: mlflow_attr}
+    mlflow_attr = { **_mk_system_attr(script), **info_attr, **mlflow_attr }
     os.makedirs(dir, exist_ok=True)
-    write_file(path, content)
+    write_file(path, mlflow_attr)
 
 
 def write_file(path, content):
@@ -64,7 +65,7 @@ def read_file(path):
             return json.loads(f.read())
 
 
-def get_custom(export_dct):
+def get_info(export_dct):
     return export_dct[ExportFields.INFO]
 
 
diff --git a/mlflow_export_import/common/source_tags.py b/mlflow_export_import/common/source_tags.py
@@ -8,7 +8,6 @@ class ExportFields:
 
 class ExportTags:
     """ Tags source export tags. """
-    #PREFIX_ROOT    = "mlflow_export_import"
     PREFIX_ROOT    = "mlflow_exim"
     PREFIX_RUN_INFO = f"{PREFIX_ROOT}.run_info"
     PREFIX_MLFLOW = f"{PREFIX_ROOT}.mlflow"
diff --git a/mlflow_export_import/experiment/export_experiment.py b/mlflow_export_import/experiment/export_experiment.py
@@ -44,7 +44,7 @@ def export_experiment(self, exp_id_or_name, output_dir, run_ids=None):
             for j,run in enumerate(SearchRunsIterator(self.mlflow_client, exp.experiment_id)):
                 self._export_run(j, run, output_dir, ok_run_ids, failed_run_ids)
 
-        custom_info = {
+        info_attr = {
             "num_total_runs": (j+1),
             "num_ok_runs": len(ok_run_ids),
             "ok_runs": ok_run_ids,
@@ -53,9 +53,9 @@ def export_experiment(self, exp_id_or_name, output_dir, run_ids=None):
         }
         exp_dct = utils.strip_underscores(exp) 
         exp_dct["tags"] = dict(sorted(exp_dct["tags"].items()))
-        content = { "experiment": exp_dct }
 
-        io_utils.write_export_file(output_dir, "experiment.json", content, custom_info)
+        mlflow_attr = { "experiment": exp_dct }
+        io_utils.write_export_file(output_dir, "experiment.json", __file__, mlflow_attr, info_attr)
 
         msg = f"for experiment '{exp.name}' (ID: {exp.experiment_id})"
         if len(failed_run_ids) == 0:
diff --git a/mlflow_export_import/experiment/import_experiment.py b/mlflow_export_import/experiment/import_experiment.py
@@ -11,7 +11,7 @@
 from mlflow_export_import.common import mlflow_utils
 from mlflow_export_import.common.http_client import DatabricksHttpClient
 from mlflow_export_import.run.import_run import RunImporter
-from mlflow_export_import.common.source_tags import ExportFields, ExportTags
+from mlflow_export_import.common.source_tags import ExportTags
 
 
 def _peek_at_experiments(exp_dir):
@@ -48,7 +48,7 @@ def import_experiment(self, exp_name, input_dir, dst_notebook_dir=None):
 
         path = io_utils.mk_manifest_json_path(input_dir, "experiment.json")
         exp_dct = io_utils.read_file(path)
-        custom_info = io_utils.get_custom(exp_dct)
+        info = io_utils.get_info(exp_dct)
         exp_dct = io_utils.get_mlflow(exp_dct)
 
         tags = exp_dct["experiment"]["tags"] 
@@ -58,8 +58,8 @@ def import_experiment(self, exp_name, input_dir, dst_notebook_dir=None):
 
         mlflow_utils.set_experiment(self.mlflow_client, self.dbx_client, exp_name, tags)
 
-        run_ids = custom_info["ok_runs"]
-        failed_run_ids = custom_info["failed_runs"]
+        run_ids = info["ok_runs"]
+        failed_run_ids = info["failed_runs"]
 
         print(f"Importing {len(run_ids)} runs into experiment '{exp_name}' from {input_dir}")
         run_ids_map = {}
diff --git a/mlflow_export_import/model/export_model.py b/mlflow_export_import/model/export_model.py
@@ -17,20 +17,19 @@ class ModelExporter():
 
     def __init__(self,  mlflow_client, notebook_formats=None, stages=None, versions=None, export_run=True):
         """
-        :param mlflow_client: MLflow client or if None create default client.
+        :param mlflow_client: MlflowClient
         :param notebook_formats: List of notebook formats to export. Values are SOURCE, HTML, JUPYTER or DBC.
         :param stages: Stages to export. Default is all stages. Values are Production, Staging, Archived and None.
         :param export_run: Export the run that generated a registered model's version.
         """
         self.mlflow_client = mlflow_client
         self.http_client = MlflowHttpClient()
         self.run_exporter = RunExporter(self.mlflow_client, notebook_formats=notebook_formats)
-        self.stages = self._normalize_stages(stages)
         self.export_run = export_run
+        self.stages = self._normalize_stages(stages)
         self.versions = versions if versions else []
         if len(self.stages) > 0 and len(self.versions) > 0:
             raise MlflowExportImportException(f"Both stages {self.stages} and versions {self.versions} cannot be set")
-        self.export_run = export_run
 
 
     def export_model(self, model_name, output_dir):
@@ -62,7 +61,7 @@ def _export_model(self, model_name, output_dir):
             opath = os.path.join(output_dir,run_id)
             opath = opath.replace("dbfs:", "/dbfs")
             dct = { "version": vr.version, "stage": vr.current_stage, "run_id": run_id, "description": vr.description, "tags": vr.tags }
-            print(f"Exporting version: {dct}")
+            print(f"Exporting verions {vr.version} to '{opath}'")
             manifest.append(dct)
             try:
                 if self.export_run:
@@ -85,13 +84,13 @@ def _export_model(self, model_name, output_dir):
         model = self.http_client.get(f"registered-models/get", {"name": model_name})
         model["registered_model"]["latest_versions"] = output_versions
 
-        custom_info = {
+        info_attr = {
             "num_target_stages": len(self.stages),
             "num_target_versions": len(self.versions),
             "num_src_versions": len(versions),
             "num_dst_versions": len(output_versions)
         }
-        io_utils.write_export_file(output_dir, "model.json", model, custom_info)
+        io_utils.write_export_file(output_dir, "model.json", __file__, model, info_attr)
 
         print(f"Exported {exported_versions}/{len(output_versions)} versions for model '{model_name}'")
         return manifest
diff --git a/mlflow_export_import/run/export_run.py b/mlflow_export_import/run/export_run.py
@@ -53,13 +53,13 @@ def export_run(self, run_id, output_dir):
         tags = run.data.tags
         tags = dict(sorted(tags.items()))
         
-        content = {
+        mlflow_attr = {
             "info": utils.strip_underscores(run.info),
             "params": run.data.params,
             "metrics": self._get_metrics_with_steps(run),
             "tags": tags
         }
-        io_utils.write_export_file(output_dir, "run.json", content)
+        io_utils.write_export_file(output_dir, "run.json", __file__, mlflow_attr)
         fs =  _filesystem.get_filesystem(".")
 
         # copy artifacts

Original file line number	Diff line number	Diff line change
`@@ -32,14 +32,14 @@ def export_all(output_dir, notebook_formats=None, use_threads=False):`
`32`	`32`	`use_threads=use_threads)`
`33`	`33`	`duration = round(time.time() - start_time, 1)`
`34`	`34`
`35`		`- content = {`
	`35`	`+ mlflow_attr = {`
`36`	`36`	`"summary": {`
`37`	`37`	`"stages": ALL_STAGES,`
`38`	`38`	`"notebook_formats": notebook_formats,`
`39`	`39`	`"duration": duration`
`40`	`40`	`}`
`41`	`41`	`}`
`42`		`- io_utils.write_export_file(output_dir, "all_manifest.json", content)`
	`42`	`+ io_utils.write_export_file(output_dir, "all_manifest.json", __file__, mlflow_attr)`
`43`	`43`	`print(f"Duraton for entire tracking server export: {duration} seconds")`
`44`	`44`
`45`	`45`