
Commit 302a458

taylorfturner, suprabhatgurrala, junholee6a, mainkhan, and mhmotamedi authored Nov 13, 2023
Staging/main/0.10.6 (#1065)
* Add null ratio to column stats (#1052)
* Delay transforming priority_order into ndarray (#1045)
  In the changed code, we had a mypy error because numpy ndarrays are not compatible with random.Random.shuffle() (the expected argument type is MutableSequence[Any]). We fix this by first instantiating priority_order as a list, then shuffling it, then creating an ndarray from it afterwards.
* Rename references to degrees of freedom from df to deg_of_free (#1056)
  * change references to degrees of freedom in chi2 from df to deg_of_free
  * reformatted using black pre-commit hook
* add_s3_connection_remote_loading_s3uri_feature (#1054)
  * add_s3_connection_remote_loading_s3uri_feature
  * pre-commit fix
  * created S3Helper class and refactored data_utils and unit test
  * enhanced test_data.py with test_read_s3_uri
  * enhanced unit tests and refactored is_s3_uri
  * refactored some unit-test structure
  * renamed TestCreateS3Client to TestS3Helper
* fix directions for contrib branch (#1059)
* Feature: Plugins (#1060)
  * Reservoir sampling (#826)
    * add code for reservoir sampling and insert sample_nrows options
    * pre-commit fix
    * add tests for reservoir sampling
    * fixed mypy issues
    * fix import to relative path
    Co-authored-by: Taylor Turner <[email protected]>
    Co-authored-by: Richard Bann <[email protected]>
  * plugins loading + preset plugin fetching implementation (#911)
    * test
    * Plugin implementation
    * comments added to functions
    * plugin test implementation for plugin presets
    * forgot an import
    * added None catch
    * preset plugin test
    * removing code I forgot to delete
    * snake_case function names
    * relative path
    * relative path
    * made new file for plugin testing
    * forgot to delete function from old file
    * now I've fixed the if statement
    * ok, this should be it
  * Plugin testing (#947)
    * test
    * plugin test implementation for plugin presets
    * forgot an import
    * added None catch
    * preset plugin test
    * snake_case function names
    * relative path
    * relative path
    * forgot to delete function from old file
    * nothing yet, just want this in two different repos
    * new test for plugins feature and small update to plugin init
    * pass
    * didn't want dir to be overwritten
    * forgot a dir
  * fix isort pre-commit
  * reservoir sample
  * fix imports
  * fix testing
  * fix req to match dev
  Co-authored-by: Rushabh Vinchhi <[email protected]>
  Co-authored-by: Richard Bann <[email protected]>
  Co-authored-by: Liz Smith <[email protected]>
* version bump (#1064)
* empty test

Co-authored-by: Suprabhat Gurrala <[email protected]>
Co-authored-by: Junho Lee <[email protected]>
Co-authored-by: Main Uddin Khan <[email protected]>
Co-authored-by: Mohammad Motamedi <[email protected]>
Co-authored-by: Rushabh Vinchhi <[email protected]>
Co-authored-by: Richard Bann <[email protected]>
Co-authored-by: Liz Smith <[email protected]>
1 parent 3ef1daa · commit 302a458

27 files changed · +550 −48 lines changed
 

.github/CONTRIBUTING.md
+1 −1

@@ -53,7 +53,7 @@ For more nuanced testing runs, check out more detailed documentation [here](http
 ## Creating [Pull Requests](https://github.com/capitalone/DataProfiler/pulls)
 Pull requests are the best way to propose changes to the codebase. We actively welcome your pull requests:

-1. Fork the repo and create your branch from `main`.
+1. Fork the repo and create your branch from `dev`.
 2. If you've added code that should be tested, add tests.
 3. If you've changed APIs, update the documentation.
 4. Ensure the test suite passes.

.gitignore
+4 −1

@@ -14,7 +14,7 @@ dataprofiler/labelers/embeddings/glove-reduced-64D.txt

 .cache/
 .idea/
-.vscode
+.vscode*
 *.pyc
 *.pkl
 *.whl
@@ -134,3 +134,6 @@ venv.bak/
 env3/

 *.bak
+
+#Pipfiles
+Pipfile*

.pre-commit-config.yaml
+2 −1

@@ -64,6 +64,7 @@ repos:
         typing-extensions>=3.10.0.2,
         HLL>=2.0.3,
         datasketches>=4.1.0,
+        boto3>=1.28.61,

         # requirements-dev.txt
         check-manifest>=0.48,
@@ -110,7 +111,7 @@ repos:
       additional_dependencies: ['h5py', 'wheel', 'future', 'numpy', 'pandas',
         'python-dateutil', 'pytz', 'pyarrow', 'chardet', 'fastavro',
         'python-snappy', 'charset-normalizer', 'psutil', 'scipy', 'requests',
-        'networkx','typing-extensions', 'HLL', 'datasketches']
+        'networkx','typing-extensions', 'HLL', 'datasketches', 'boto3']
   # Pyupgrade - standardize and modernize Python syntax for newer versions of the language
   - repo: https://github.com/asottile/pyupgrade
     rev: v3.3.0

dataprofiler/__init__.py
+4 −0

@@ -8,6 +8,7 @@
     UnstructuredDataLabeler,
     train_structured_labeler,
 )
+from .plugins import load_plugins
 from .profilers.graph_profiler import GraphProfiler
 from .profilers.profile_builder import (
     Profiler,
@@ -41,3 +42,6 @@ def set_seed(seed=None):
     if seed is not None and (not isinstance(seed, int) or seed < 0):
         raise ValueError("Seed should be a non-negative integer.")
     settings._seed = seed
+
+
+load_plugins()

dataprofiler/data_readers/data.py
+9 −2

@@ -6,7 +6,7 @@
 from .. import dp_logging
 from .avro_data import AVROData
 from .csv_data import CSVData
-from .data_utils import is_valid_url, url_to_bytes
+from .data_utils import S3Helper, is_valid_url, url_to_bytes
 from .graph_data import GraphData
 from .json_data import JSONData
 from .parquet_data import ParquetData
@@ -65,7 +65,14 @@ def __new__(
             options = dict()

         if is_valid_url(input_file_path):
-            input_file_path = url_to_bytes(input_file_path, options)
+            if S3Helper.is_s3_uri(input_file_path, logger=logger):
+                storage_options = options.pop("storage_options", {})
+                s3 = S3Helper.create_s3_client(**storage_options)
+                input_file_path = S3Helper.get_s3_uri(
+                    s3_uri=input_file_path, s3_client=s3
+                )
+            else:
+                input_file_path = url_to_bytes(input_file_path, options)

         for data_class_info in cls.data_classes:
             data_class = data_class_info["data_class"]
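
Note: with this change, Data can resolve S3 URIs directly. A minimal usage sketch — the bucket, key, and option values below are hypothetical, and credentials may instead come from the AWS environment variables handled in data_utils.py:

import dataprofiler as dp

# Hypothetical URI; "storage_options" is forwarded to S3Helper.create_s3_client.
data = dp.Data(
    "s3://my-bucket/input.csv",
    options={
        "storage_options": {
            "region_name": "us-east-1",  # assumption: region passed explicitly
        }
    },
)
print(data.data.head())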

dataprofiler/data_readers/data_utils.py
+126 −0

@@ -1,5 +1,7 @@
 """Contains functions for data readers."""
 import json
+import logging
+import os
 import re
 import urllib
 from collections import OrderedDict
@@ -19,6 +21,8 @@
     cast,
 )

+import boto3
+import botocore
 import dateutil
 import pandas as pd
 import pyarrow.parquet as pq
@@ -843,3 +847,125 @@ def url_to_bytes(url_as_string: Url, options: Dict) -> BytesIO:

     stream.seek(0)
     return stream
+
+
+class S3Helper:
+    """
+    A utility class for working with Amazon S3.
+
+    This class provides methods to check if a path is an S3 URI
+    and to create an S3 client.
+    """
+
+    @staticmethod
+    def is_s3_uri(path: str, logger: logging.Logger) -> bool:
+        """
+        Check if the given path is an S3 URI.
+
+        This function checks for common S3 URI prefixes "s3://" and "s3a://".
+
+        Args:
+            path (str): The path to check for an S3 URI.
+            logger (logging.Logger): The logger instance for logging.
+
+        Returns:
+            bool: True if the path is an S3 URI, False otherwise.
+        """
+        # Define the S3 URI prefixes to check
+        s3_uri_prefixes = ["s3://", "s3a://"]
+        path = path.strip()
+        # Check if the path starts with any of the specified prefixes
+        is_s3 = any(path.startswith(prefix) for prefix in s3_uri_prefixes)
+        if not is_s3:
+            logger.debug(f"'{path}' is not a valid S3 URI")
+
+        return is_s3
+
+    @staticmethod
+    def _create_boto3_client(
+        aws_access_key_id: Optional[str],
+        aws_secret_access_key: Optional[str],
+        aws_session_token: Optional[str],
+        region_name: Optional[str],
+    ) -> boto3.client:
+        return boto3.client(
+            "s3",
+            aws_access_key_id=aws_access_key_id,
+            aws_secret_access_key=aws_secret_access_key,
+            aws_session_token=aws_session_token,
+            region_name=region_name,
+        )
+
+    @staticmethod
+    def create_s3_client(
+        aws_access_key_id: Optional[str] = None,
+        aws_secret_access_key: Optional[str] = None,
+        aws_session_token: Optional[str] = None,
+        region_name: Optional[str] = None,
+    ) -> boto3.client:
+        """
+        Create and return an S3 client.
+
+        Args:
+            aws_access_key_id (str): The AWS access key ID.
+            aws_secret_access_key (str): The AWS secret access key.
+            aws_session_token (str): The AWS session token
+                (optional, typically used for temporary credentials).
+            region_name (str): The AWS region name (default is 'us-east-1').
+
+        Returns:
+            boto3.client: An S3 client instance.
+        """
+        # Check if credentials are not provided
+        # and use environment variables as fallback
+        if aws_access_key_id is None:
+            aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
+        if aws_secret_access_key is None:
+            aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
+        if aws_session_token is None:
+            aws_session_token = os.environ.get("AWS_SESSION_TOKEN")
+
+        # Check if region is not provided and use environment variable as fallback
+        if region_name is None:
+            region_name = os.environ.get("AWS_REGION", "us-east-1")
+
+        # Check if IAM roles for service accounts are available
+        try:
+            s3 = S3Helper._create_boto3_client(
+                aws_access_key_id, aws_secret_access_key, aws_session_token, region_name
+            )
+        except botocore.exceptions.NoCredentialsError:
+            # IAM roles are not available, so fall back to provided credentials
+            if aws_access_key_id is None or aws_secret_access_key is None:
+                raise ValueError(
+                    "AWS access key ID and secret access key are required."
+                )
+            s3 = S3Helper._create_boto3_client(
+                aws_access_key_id, aws_secret_access_key, aws_session_token, region_name
+            )
+
+        return s3
+
+    @staticmethod
+    def get_s3_uri(s3_uri: str, s3_client: boto3.client) -> BytesIO:
+        """
+        Download an object from an S3 URI and return its content as BytesIO.
+
+        Args:
+            s3_uri (str): The S3 URI specifying the location of the object to download.
+            s3_client (boto3.client): An initialized AWS S3 client
+                for accessing the S3 service.
+
+        Returns:
+            BytesIO: A BytesIO object containing the content of
+                the downloaded S3 object.
+        """
+        # Parse the S3 URI
+        parsed_uri = urllib.parse.urlsplit(s3_uri)
+        bucket_name = parsed_uri.netloc
+        file_key = parsed_uri.path.lstrip("/")
+        # Download the S3 object
+        response = s3_client.get_object(Bucket=bucket_name, Key=file_key)

+        # Return the object's content as BytesIO
+        return BytesIO(response["Body"].read())
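
Note: in isolation the helper can be exercised as below — a sketch in which the URI is hypothetical and credentials are assumed to come from the standard AWS environment variables:

import logging

from dataprofiler.data_readers.data_utils import S3Helper

logger = logging.getLogger(__name__)
uri = "s3://my-bucket/path/to/data.csv"  # hypothetical object

if S3Helper.is_s3_uri(uri, logger=logger):
    # No explicit keys: create_s3_client falls back to AWS_ACCESS_KEY_ID,
    # AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, and AWS_REGION.
    client = S3Helper.create_s3_client()
    stream = S3Helper.get_s3_uri(s3_uri=uri, s3_client=client)
    print(stream.read(100))  # first 100 bytes of the downloaded object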

dataprofiler/labelers/data_processing.py
+3 −3

@@ -2047,9 +2047,9 @@ def process(
         elif aggregation_func == "random":
             num_labels = max(label_mapping.values()) + 1
             random_state: random.Random = self._parameters["random_state"]
-            priority_order = np.array(list(range(num_labels)))
-            random_state.shuffle(priority_order)  # type: ignore
-            self.priority_prediction(results, priority_order)
+            priority_order = list(range(num_labels))
+            random_state.shuffle(priority_order)
+            self.priority_prediction(results, np.array(priority_order))
         else:
             raise ValueError(
                 f"`{aggregation_func}` is not a valid aggregation function"

dataprofiler/plugins/__init__.py
+38 −0

@@ -0,0 +1,38 @@
+import importlib
+import os
+
+from .decorators import plugin_decorator, plugins_dict
+
+
+def load_plugins():
+    """
+    Digs through plugins folder for possible plugins to be imported
+    and consequently added to the plugins_dict if properly decorated
+
+    :return: None
+    """
+    plugin_path = os.path.dirname(os.path.abspath(__file__))
+    for folder in os.listdir(plugin_path):
+        option_path = os.path.join(plugin_path, folder)
+        if os.path.isdir(option_path):
+            if folder == "__pycache__":
+                continue
+            for filename in os.listdir(option_path):
+                if filename is None or not filename.endswith(".py"):
+                    continue
+                spec = importlib.util.spec_from_file_location(
+                    filename, os.path.join(option_path, filename)
+                )
+                if spec is not None:
+                    module = importlib.util.module_from_spec(spec)
+                    spec.loader.exec_module(module)
+
+
+def get_plugins(typ):
+    """
+    Fetches a dictionary of plugins of a certain type
+
+    :param typ: Broader classification/type of a plugin
+    :return: dict
+    """
+    return plugins_dict.get(typ)
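
Note: any properly decorated function in a subfolder of dataprofiler/plugins/ is imported at load time and becomes fetchable by type. A sketch — the "report" type and "null_summary" name are hypothetical, not shipped presets:

from dataprofiler.plugins import get_plugins

# get_plugins returns the dict registered under that type, or None if empty.
report_plugins = get_plugins("report")  # hypothetical plugin type
if report_plugins is not None:
    plugin_fn = report_plugins.get("null_summary")  # hypothetical plugin name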

dataprofiler/plugins/decorators.py
+28 −0

@@ -0,0 +1,28 @@
+"""Contains function for generating plugins data."""
+from collections import defaultdict
+from typing import Any, DefaultDict, Dict
+
+plugins_dict: DefaultDict[str, Dict[str, Any]] = defaultdict(dict)
+
+
+def plugin_decorator(typ, name):
+    """
+    Populate plugins_dict with decorated plugin functions.
+
+    :param typ: Broader classification/type of a plugin
+    :param name: Specific name of a plugin
+    :return: function
+    """
+
+    def __inner_factory_function(fn):
+        """
+        Actual population of plugin_dict.
+
+        :param fn: Plugin function
+        :return: function
+        """
+        global plugins_dict
+        plugins_dict[typ][name] = fn
+        return fn
+
+    return __inner_factory_function
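
Note: registering a plugin is then a one-liner on the function definition. A minimal sketch — the typ/name values and the function body are illustrative, not part of the library:

from dataprofiler.plugins.decorators import plugin_decorator, plugins_dict


@plugin_decorator(typ="report", name="null_summary")  # hypothetical values
def null_summary(report: dict) -> dict:
    """Toy plugin: keep only the null-related keys of a report dict."""
    return {k: v for k, v in report.items() if "null" in k}


# The decorator stores the function under plugins_dict[typ][name]
# and returns it unchanged, so it stays directly callable.
assert plugins_dict["report"]["null_summary"] is null_summary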

dataprofiler/profilers/numerical_column_stats.py
+8 −8

@@ -611,8 +611,8 @@ def _perform_t_test(
 ) -> dict:
     results: dict = {
         "t-statistic": None,
-        "conservative": {"df": None, "p-value": None},
-        "welch": {"df": None, "p-value": None},
+        "conservative": {"deg_of_free": None, "p-value": None},
+        "welch": {"deg_of_free": None, "p-value": None},
     }

     invalid_stats = False
@@ -647,17 +647,17 @@

     s_delta = var1 / n1 + var2 / n2
     t = (mean1 - mean2) / np.sqrt(s_delta)
-    conservative_df = min(n1, n2) - 1
-    welch_df = s_delta**2 / (
+    conservative_deg_of_free = min(n1, n2) - 1
+    welch_deg_of_free = s_delta**2 / (
         (var1 / n1) ** 2 / (n1 - 1) + (var2 / n2) ** 2 / (n2 - 1)
     )
     results["t-statistic"] = t
-    results["conservative"]["df"] = float(conservative_df)
-    results["welch"]["df"] = float(welch_df)
+    results["conservative"]["deg_of_free"] = float(conservative_deg_of_free)
+    results["welch"]["deg_of_free"] = float(welch_deg_of_free)

-    conservative_t = scipy.stats.t(conservative_df)
+    conservative_t = scipy.stats.t(conservative_deg_of_free)
     conservative_p_val = (1 - conservative_t.cdf(abs(t))) * 2
-    welch_t = scipy.stats.t(welch_df)
+    welch_t = scipy.stats.t(welch_deg_of_free)
     welch_p_val = (1 - welch_t.cdf(abs(t))) * 2

     results["conservative"]["p-value"] = float(conservative_p_val)

dataprofiler/profilers/profile_builder.py
+7 −0

@@ -94,6 +94,7 @@ def __init__(
         self.sample_size: int = 0
         self.sample: list[str] = list()
         self.null_count: int = 0
+        self.null_ratio: float | None = None
         self.null_types: list[str] = list()
         self.null_types_index: dict = {}
         self._min_id: int | None = None
@@ -292,6 +293,9 @@ def diff(self, other_profile: StructuredColProfiler, options: dict = None) -> dict:
             "null_count": profiler_utils.find_diff_of_numbers(
                 self.null_count, other_profile.null_count
             ),
+            "null_ratio": profiler_utils.find_diff_of_numbers(
+                self.null_ratio, other_profile.null_ratio
+            ),
             "null_types": profiler_utils.find_diff_of_lists_and_sets(
                 self.null_types, other_profile.null_types
             ),
@@ -428,6 +432,7 @@ def _update_base_stats(self, base_stats: dict) -> None:
         self._last_batch_size = base_stats["sample_size"]
         self.sample = base_stats["sample"]
         self.null_count += base_stats["null_count"]
+        self.null_ratio = base_stats["null_count"] / base_stats["sample_size"]
         self.null_types = profiler_utils._combine_unique_sets(
             self.null_types, list(base_stats["null_types"].keys())
         )
@@ -570,6 +575,7 @@ def clean_data_and_get_base_stats(
             {
                 "sample_size": 0,
                 "null_count": 0,
+                "null_ratio": None,
                 "null_types": dict(),
                 "sample": [],
                 "min_id": None,
@@ -658,6 +664,7 @@ def clean_data_and_get_base_stats(
         base_stats = {
             "sample_size": total_sample_size,
             "null_count": total_na,
+            "null_ratio": total_na / total_sample_size,
             "null_types": na_columns,
             "sample": rng.choice(
                 list(df_series.values), (min(len(df_series), 5),), replace=False
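
Note: a quick sketch of the new statistic — the DataFrame is illustrative, and this assumes null_ratio is surfaced in the per-column report statistics alongside null_count:

import pandas as pd

import dataprofiler as dp

df = pd.DataFrame({"a": [1, None, 3, None]})  # 2 nulls in 4 rows
profiler = dp.Profiler(df)
report = profiler.report()

stats = report["data_stats"][0]["statistics"]
# Expect null_count == 2 and null_ratio == 0.5 for this toy column.
print(stats.get("null_count"), stats.get("null_ratio"))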
