Skip to content

Commit 32eba99

Browse files
Merge pull request #13 from Clearbox-AI/develop
Develop in main
2 parents 4c481d9 + 463e770 commit 32eba99

6 files changed

Lines changed: 191 additions & 47 deletions

File tree

README.md

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,14 +80,16 @@ Below is a code snippet example for the usage of the library:
8080
# Import the necessary modules from the SURE library
8181
from sure import Preprocessor, report
8282
from sure.utility import (compute_statistical_metrics, compute_mutual_info,
83-
compute_utility_metrics_class)
83+
compute_utility_metrics_class,
84+
detection,
85+
query_power)
8486
from sure.privacy import (distance_to_closest_record, dcr_stats, number_of_dcr_equal_to_zero, validation_dcr_test,
8587
adversary_dataset, membership_inference_test)
8688

8789
# Assuming real_data, valid_data and synth_data are three pandas DataFrames
8890

8991
# Preprocessor initialization and query execution on the real, synthetic and validation datasets
90-
preprocessor = Preprocessor(real_data, get_discarded_info=False, num_fill_null='forward', scaling='standardize')
92+
preprocessor = Preprocessor(real_data, num_fill_null='forward', scaling='standardize')
9193

9294
real_data_preprocessed = preprocessor.transform(real_data)
9395
valid_data_preprocessed = preprocessor.transform(valid_data)
@@ -115,6 +117,12 @@ dcr_zero_synth_train = number_of_dcr_equal_to_zero("synth_train", dcr_synth_tra
115117
dcr_zero_synth_valid = number_of_dcr_equal_to_zero("synth_val", dcr_synth_valid)
116118
share = validation_dcr_test(dcr_synth_train, dcr_synth_valid)
117119

120+
# Detection Score
121+
detection_score = detection(real_data, synth_data, preprocessor)
122+
123+
# Query Power
124+
query_power_score = query_power(real_data, synth_data, preprocessor)
125+
118126
# ML privacy attack sandbox initialization and simulation
119127
adversary_df = adversary_dataset(real_data_preprocessed, valid_data_preprocessed)
120128
# The function adversary_dataset adds a column "privacy_test_is_training" to the adversary dataset, indicating whether the record was part of the training set or not

docs/source/doc_2.md

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -23,23 +23,19 @@ Follow the step-by-step guide to test the library using the provided [instructio
2323
# Import the necessary modules from the SURE library
2424
from sure import Preprocessor, report
2525
from sure.utility import (compute_statistical_metrics, compute_mutual_info,
26-
compute_utility_metrics_class)
26+
compute_utility_metrics_class,
27+
detection,
28+
query_power)
2729
from sure.privacy import (distance_to_closest_record, dcr_stats, number_of_dcr_equal_to_zero, validation_dcr_test,
2830
adversary_dataset, membership_inference_test)
2931

3032
# Assuming real_data, valid_data and synth_data are three pandas DataFrames
3133

32-
# Real dataset - Preprocessor initialization and query exacution
33-
preprocessor = Preprocessor(real_data, get_discarded_info=False)
34-
real_data_preprocessed = preprocessor.transform(real_data, num_fill_null='forward', scaling='standardize')
35-
36-
# Validation dataset - Preprocessor initialization and query exacution
37-
preprocessor = Preprocessor(valid_data, get_discarded_info=False)
38-
valid_data_preprocessed = preprocessor.transform(valid_data, num_fill_null='forward', scaling='standardize')
39-
40-
# Synthetic dataset - Preprocessor initialization and query exacution
41-
preprocessor = Preprocessor(synth_data, get_discarded_info=False)
42-
synth_data_preprocessed = preprocessor.transform(synth_data, num_fill_null='forward', scaling='standardize')
34+
# Preprocessor initialization and query execution on the real, synthetic and validation datasets
35+
preprocessor = Preprocessor(real_data)
36+
real_data_preprocessed = preprocessor.transform(real_data)
37+
valid_data_preprocessed = preprocessor.transform(valid_data)
38+
synth_data_preprocessed = preprocessor.transform(synth_data)
4339

4440
# Statistical properties and mutual information
4541
num_features_stats, cat_features_stats, temporal_feat_stats = compute_statistical_metrics(real_data, synth_data)
@@ -63,6 +59,12 @@ dcr_zero_synth_train = number_of_dcr_equal_to_zero("synth_train", dcr_synth_tra
6359
dcr_zero_synth_valid = number_of_dcr_equal_to_zero("synth_val", dcr_synth_valid)
6460
share = validation_dcr_test(dcr_synth_train, dcr_synth_valid)
6561

62+
# Detection Score
63+
detection_score = detection(real_data, synth_data, preprocessor)
64+
65+
# Query Power
66+
query_power_score = query_power(real_data, synth_data, preprocessor)
67+
6668
# ML privacy attack sandbox initialization and simulation
6769
adversary_df = adversary_dataset(real_data_preprocessed, valid_data_preprocessed)
6870
# The function adversary_dataset adds a column "privacy_test_is_training" to the adversary dataset, indicating whether the record was part of the training set or not

examples/sure_test.ipynb

Lines changed: 70 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,11 @@
6464
"\n",
6565
"from sure import Preprocessor, report\n",
6666
"from sure.utility import (compute_statistical_metrics, compute_mutual_info,\n",
67-
"\t\t\t compute_utility_metrics_class)\n",
67+
"\t\t\t \t\t\t compute_utility_metrics_class,\n",
68+
"\t\t\t\t\t\t detection,\n",
69+
"\t\t\t\t\t\t query_power)\n",
6870
"from sure.privacy import (distance_to_closest_record, dcr_stats, number_of_dcr_equal_to_zero, validation_dcr_test, \n",
69-
"\t\t\t adversary_dataset, membership_inference_test)"
71+
"\t\t\t adversary_dataset, membership_inference_test)"
7072
]
7173
},
7274
{
@@ -111,7 +113,7 @@
111113
"outputs": [],
112114
"source": [
113115
"# Preprocessor initialization and query execution on the real, synthetic and validation datasets\n",
114-
"preprocessor = Preprocessor(real_data, get_discarded_info=False, num_fill_null='forward', scaling='standardize')\n",
116+
"preprocessor = Preprocessor(real_data, num_fill_null='forward', scaling='standardize')\n",
115117
"\n",
116118
"real_data_preprocessed = preprocessor.transform(real_data)\n",
117119
"valid_data_preprocessed = preprocessor.transform(valid_data)\n",
@@ -129,7 +131,8 @@
129131
"cell_type": "markdown",
130132
"metadata": {},
131133
"source": [
132-
"#### 2.1 Statistical properties and mutual information"
134+
"#### 2.1 Statistical properties and mutual information\n",
135+
"These functions compute general statistical features, the correlation matrices and the difference between the correlation matrix of the real and synthetic dataset."
133136
]
134137
},
135138
{
@@ -147,7 +150,10 @@
147150
"cell_type": "markdown",
148151
"metadata": {},
149152
"source": [
150-
"#### 2.2 ML utility - Train on Synthetic Test on Real"
153+
"#### 2.2 ML utility - Train on Synthetic Test on Real\n",
154+
"The `compute_utility_metrics_class` trains multiple machine learning classification models on the synthetic dataset and evaluates their performance on the validation set.\n",
155+
"\n",
156+
"For comparison, it also trains the same models on the original training set and evaluates them on the same validation set. This allows a direct comparison between models trained on synthetic data and those trained on real data."
151157
]
152158
},
153159
{
@@ -168,6 +174,63 @@
168174
"TSTR_metrics = compute_utility_metrics_class(X_train, X_synth, X_test, y_train, y_synth, y_test)"
169175
]
170176
},
177+
{
178+
"cell_type": "markdown",
179+
"metadata": {},
180+
"source": [
181+
"#### 2.3 Detection Score\n",
182+
"Computes the detection score by training an XGBoost model to differentiate between original and synthetic data. \n",
183+
"\n",
184+
"The lower the model's accuracy, the higher the quality of the synthetic data.\n",
185+
"\n",
186+
"\n",
187+
"The detection score is computed as\n",
188+
"\n",
189+
"detection_score = 2*(1 - ROC_AUC)\n",
190+
"\n",
191+
"So if ROC_AUC<=0.5 the synthetic dataset is considered indistinguishable from the real dataset (detection score =1)\n"
192+
]
193+
},
194+
{
195+
"cell_type": "code",
196+
"execution_count": null,
197+
"metadata": {},
198+
"outputs": [],
199+
"source": [
200+
"detection_score = detection(real_data, synth_data, preprocessor)\n",
201+
"print(\"Detection accuracy: \", detection_score[\"accuracy\"])\n",
202+
"print(\"Detection ROC_AUC: \", detection_score[\"ROC_AUC\"])\n",
203+
"print(\"Detection score: \", detection_score[\"score\"])\n",
204+
"print(\"Detection feature importances: \", detection_score[\"feature_importances\"])"
205+
]
206+
},
207+
{
208+
"cell_type": "markdown",
209+
"metadata": {},
210+
"source": [
211+
"#### Query Power\n",
212+
"Generates and runs queries to compare the original and synthetic datasets.\n",
213+
"\n",
214+
"This method creates random queries that filter data from both datasets.\n",
215+
"\n",
216+
"The similarity between the sizes of the filtered results is used to score the quality of the synthetic data."
217+
]
218+
},
219+
{
220+
"cell_type": "code",
221+
"execution_count": null,
222+
"metadata": {},
223+
"outputs": [],
224+
"source": [
225+
"query_power_score = query_power(real_data, synth_data, preprocessor)\n",
226+
"\n",
227+
"print(\"Query Power score: \", query_power_score[\"score\"])\n",
228+
"for query in query_power_score[\"queries\"]:\n",
229+
" print(\"\\n\", query[\"text\"])\n",
230+
" print(\"Query result on real: \", query[\"original_df\"])\n",
231+
" print(\"Query result on synthetic: \", query[\"synthetic_df\"])"
232+
]
233+
},
171234
{
172235
"cell_type": "markdown",
173236
"metadata": {},
@@ -278,9 +341,9 @@
278341
],
279342
"metadata": {
280343
"kernelspec": {
281-
"display_name": "projects",
344+
"display_name": "Python (test_sure)",
282345
"language": "python",
283-
"name": "python3"
346+
"name": "test_sure"
284347
},
285348
"language_info": {
286349
"codemirror_mode": {

sure/distance_metrics/distance.py

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ def distance_to_closest_record(
8585
y_dataframe: pd.DataFrame | pl.DataFrame | pl.LazyFrame = None,
8686
feature_weights: np.ndarray | List = None,
8787
parallel: bool = True,
88-
save_output: bool = True,
88+
save_data: bool = True,
8989
path_to_json: str = ""
9090
) -> np.ndarray:
9191
"""
@@ -120,8 +120,10 @@ def distance_to_closest_record(
120120
If None, each feature weight is 1.0
121121
parallel : Boolean, optional
122122
Whether to enable the parallelization to compute Gower matrix, by default True
123-
save_output : bool
123+
save_data : bool
124124
If True, saves the DCR information into the JSON file used to generate the final report.
125+
path_to_json : str
126+
Path to the JSON file used to generate the final report.
125127
126128
Returns
127129
-------
@@ -263,12 +265,13 @@ def distance_to_closest_record(
263265
weight_sum,
264266
fill_diagonal,
265267
)
266-
if save_output:
268+
if save_data:
267269
_save_to_json("dcr_"+dcr_name, dcr, path_to_json)
268270
return dcr
269271

270272
def dcr_stats(dcr_name: str,
271273
distances_to_closest_record: np.ndarray,
274+
save_data: bool = True,
272275
path_to_json: str = "") -> Dict:
273276
"""
274277
This function returns the statistics for an array containing DCR computed previously.
@@ -284,6 +287,10 @@ def dcr_stats(dcr_name: str,
284287
distances_to_closest_record : np.ndarray
285288
A 1D-array containing the Distance to the Closest Record for each row of a dataframe
286289
shape (dataframe rows, )
290+
save_data : bool
291+
If True, saves the DCR information into the JSON file used to generate the final report.
292+
path_to_json : str
293+
Path to the JSON file used to generate the final report.
287294
288295
Returns
289296
-------
@@ -303,11 +310,13 @@ def dcr_stats(dcr_name: str,
303310
"75%": dcr_percentiles[3].item(),
304311
"max": dcr_percentiles[4].item(),
305312
}
306-
_save_to_json("dcr_"+dcr_name+"_stats", dcr_stats, path_to_json)
313+
if save_data:
314+
_save_to_json("dcr_"+dcr_name+"_stats", dcr_stats, path_to_json)
307315
return dcr_stats
308316

309317
def number_of_dcr_equal_to_zero(dcr_name: str,
310318
distances_to_closest_record: np.ndarray,
319+
save_data: bool = True,
311320
path_to_json: str = "") -> int_type:
312321
"""
313322
Return the number of 0s in the given DCR array, that is the number of duplicates/clones detected.
@@ -317,6 +326,10 @@ def number_of_dcr_equal_to_zero(dcr_name: str,
317326
distances_to_closest_record : np.ndarray
318327
A 1D-array containing the Distance to the Closest Record for each row of a dataframe
319328
shape (dataframe rows, )
329+
save_data : bool
330+
If True, saves the DCR information into the JSON file used to generate the final report.
331+
path_to_json : str
332+
Path to the JSON file used to generate the final report.
320333
321334
Returns
322335
-------
@@ -327,14 +340,16 @@ def number_of_dcr_equal_to_zero(dcr_name: str,
327340
raise TypeError("dcr_name must be one of the following:\n -\"synth_train\"\n -\"synth_val\"\n -\"other\"")
328341

329342
zero_values_mask = distances_to_closest_record == 0.0
330-
_save_to_json("dcr_"+dcr_name+"_num_of_zeros", zero_values_mask.sum(), path_to_json)
343+
if save_data:
344+
_save_to_json("dcr_"+dcr_name+"_num_of_zeros", zero_values_mask.sum(), path_to_json)
331345
return zero_values_mask.sum()
332346

333347
# def dcr_histogram(
334348
# dcr_name: str,
335349
# distances_to_closest_record: np.ndarray,
336350
# bins: int = 20,
337351
# scale_to_100: bool = True,
352+
# save_data: bool = True,
338353
# path_to_json: str = ""
339354
# ) -> Dict:
340355
# """
@@ -394,12 +409,14 @@ def number_of_dcr_equal_to_zero(dcr_name: str,
394409
# "bins_edge_without_zero": bins_without_zero.tolist(),
395410
# }
396411

397-
# _save_to_json("dcr_"+dcr_name+"_hist", dcr_hist, path_to_json)
412+
# if save_data:
413+
# _save_to_json("dcr_"+dcr_name+"_hist", dcr_hist, path_to_json)
398414
# return dcr_hist
399415

400416
def validation_dcr_test(
401417
dcr_synth_train: np.ndarray,
402418
dcr_synth_validation: np.ndarray,
419+
save_data: bool = True,
403420
path_to_json: str = ""
404421
) -> float_type:
405422
"""
@@ -416,6 +433,10 @@ def validation_dcr_test(
416433
dcr_synth_validation : np.ndarray
417434
A 1D-array containing the Distance to the Closest Record for each row of the synthetic
418435
dataset wrt the validation dataset, shape (synthetic rows, )
436+
save_data : bool
437+
If True, saves the DCR information into the JSON file used to generate the final report.
438+
path_to_json : str
439+
Path to the JSON file used to generate the final report.
419440
420441
Returns
421442
-------
@@ -455,5 +476,6 @@ def validation_dcr_test(
455476
percentage = synth_dcr_smaller_than_holdout_dcr_sum / number_of_rows * 100
456477

457478
dcr_validation = {"percentage": round(percentage,4), "warnings": warnings}
458-
_save_to_json("dcr_validation", dcr_validation, path_to_json)
479+
if save_data:
480+
_save_to_json("dcr_validation", dcr_validation, path_to_json)
459481
return dcr_validation

sure/privacy/privacy.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ def membership_inference_test(
9090
synthetic_dataset: pd.DataFrame | pl.DataFrame | pl.LazyFrame,
9191
adversary_guesses_ground_truth: np.ndarray | pd.DataFrame | pl.DataFrame | pl.LazyFrame | pl.Series,
9292
parallel: bool = True,
93+
save_data = True,
9394
path_to_json: str = ""
9495
):
9596
"""
@@ -105,6 +106,8 @@ def membership_inference_test(
105106
Ground truth labels indicating whether a sample is from the original training dataset or not.
106107
parallel : bool, optional
107108
Whether to use parallel processing for distance calculations, by default True.
109+
save_data : bool
110+
If True, saves the DCR information into the JSON file used to generate the final report, by default True.
108111
path_to_json : str, optional
109112
Path to save the attack output as a JSON file. If empty, the output is not saved, by default "".
110113
@@ -157,5 +160,6 @@ def membership_inference_test(
157160
"membership_inference_mean_risk_score": membership_inference_mean_risk_score,
158161
}
159162

160-
_save_to_json("MIA_attack", attack_output, path_to_json)
163+
if save_data:
164+
_save_to_json("MIA_attack", attack_output, path_to_json)
161165
return attack_output

0 commit comments

Comments
 (0)