From 41bed68940afd0e6941cfb1bae123ce9d3f2099a Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 8 Nov 2024 08:34:25 -0500 Subject: [PATCH 01/24] first implementation of web2parquet for crawling/downloading from seedURLs Signed-off-by: Maroun Touma --- .../web2parquet/dpk_web2parquet/transform.py | 103 ++++++++++++++++++ .../web2parquet/dpk_web2parquet/utils.py | 21 ++++ 2 files changed, 124 insertions(+) create mode 100644 transforms/universal/web2parquet/dpk_web2parquet/transform.py create mode 100644 transforms/universal/web2parquet/dpk_web2parquet/utils.py diff --git a/transforms/universal/web2parquet/dpk_web2parquet/transform.py b/transforms/universal/web2parquet/dpk_web2parquet/transform.py new file mode 100644 index 000000000..e5a16e2a7 --- /dev/null +++ b/transforms/universal/web2parquet/dpk_web2parquet/transform.py @@ -0,0 +1,103 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import time +from argparse import ArgumentParser, Namespace +from typing import Any + +import pyarrow as pa +from data_processing.transform import AbstractTableTransform +from dpk_web2parquet.utils import get_file_info + +from dpk_connector import crawl, shutdown + +user_agent = "Mozilla/5.0 (X11; Linux i686; rv:125.0) Gecko/20100101 Firefox/125.0" + +logger = get_logger(__name__) + +class Web2ParquetTransform(AbstractTableTransform): + """ + Implements a simple copy of a pyarrow Table. + """ + + def __init__(self, **kwargs): + """ + Initialize based on the dictionary of configuration information. + This is generally called with configuration parsed from the CLI arguments defined + by the companion runtime, NOOPTransformRuntime. If running inside the RayMutatingDriver, + these will be provided by that class with help from the RayMutatingDriver. + """ + # Make sure that the param name corresponds to the name used in apply_input_params method + # of NOOPTransformConfiguration class + super().__init__(kwargs) + self.seed_urls = kwargs.get("urls", []) + self.depth = kwargs.get("depth", 1) + self.downloads = kwargs.get("downloads", 10) + self.allow_mime_types = kwargs.get("mime_types", ["application/pdf","text/html","text/markdown","text/plain"]) + + assert self.seed_urls.length, "Must specify a URL to crawl. Url cannot be None" + + self.count = 0 + self.docs = [] + # create a data access object for storing files locally + self.dao = None + + def on_download(self, url: str, body: bytes, headers: dict) -> None: + """ + Callback function called when a page has been downloaded. + You have access to the request URL, response body and headers. 
+ """ + logger.debug(f"url: {url}, headers: {headers}, body: {body[:64]}") + self.count += 1 + file_info = parse_headers(headers=headers, url=url) + doc = headers + doc['url'] = url + doc['filename'], doc['content_type'] = get_file_info(headers) + doc['content'] = body + self.docs.append(doc) + try: + if self.dao: + self.dao.sav() + except: + pass + + def transform(self, table: pa.Table=None, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: + """ + Put Transform-specific to convert one Table to 0 or more tables. It also returns + a dictionary of execution statistics - arbitrary dictionary + This implementation makes no modifications so effectively implements a copy of the + input parquet to the output folder, without modification. + """ + start_time = time.time() + crawl( + self.seed_urls, + self.on_download, + user_agent=user_agent, + depth_limit=self.depth, + allow_mime_types=self.allow_mime_types + ) # blocking call + # Shutdown all crawls + shutdown() + + end_time = time.time() + table = pa.Table.from_pylist(self.docs) + metadata = { + "count": self.count, + "start time": start_time, + "end time": end_time + } + logger.info(f"Crawling is completed in {end_time - start_time:.2f} seconds") + logger.info(f"{metadata = }") + return [table], metadata + + + diff --git a/transforms/universal/web2parquet/dpk_web2parquet/utils.py b/transforms/universal/web2parquet/dpk_web2parquet/utils.py new file mode 100644 index 000000000..728274265 --- /dev/null +++ b/transforms/universal/web2parquet/dpk_web2parquet/utils.py @@ -0,0 +1,21 @@ +from datetime import datetime + +def get_file_info(headers, url=None): + # Extract file size + file_size = int(headers.get('Content-Length', 0)) # Default to 0 if not found + + # Extract filename from Content-Disposition + try: + filename = headers.get('Content-Disposition').split('filename=')[-1].strip().strip('"') + except: + filename = None + + # try to find the file name from + if not filename: + try: + parts = 
url.split('/') + filename = parts[-1] + except: + filename= url + return filename, headers.get('Content-Type') + From cf516b5f00eb29cb726926f34339703eb7f9fe9e Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 11 Nov 2024 15:22:47 -0500 Subject: [PATCH 02/24] use makefile template Signed-off-by: Maroun Touma --- transforms/.make.modules | 61 +++++++++++++++++++ transforms/universal/web2parquet/Makefile | 23 +++++++ .../web2parquet/dpk_web2parquet/transform.py | 14 ++--- 3 files changed, 89 insertions(+), 9 deletions(-) create mode 100644 transforms/.make.modules create mode 100644 transforms/universal/web2parquet/Makefile diff --git a/transforms/.make.modules b/transforms/.make.modules new file mode 100644 index 000000000..9a7c80c98 --- /dev/null +++ b/transforms/.make.modules @@ -0,0 +1,61 @@ +# Define the root of the local git clone for the common rules to be able +# know where they are running from. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. 
+include $(REPOROOT)/transforms/.make.transforms + +###################################################################### +## Default setting for TRANSFORM_RUNTIME uses folder name-- Old layout +TRANSFORM_RUNTIME=ray +TRANSFORM_RUNTIME_SRC_FILE=-m dpk_$(TRANSFORM_NAME).$(TRANSFORM_RUNTIME).transform + +venv:: .transforms.ray-venv + +test:: .transforms.test-src test-image + +clean:: .transforms.clean + +#image:: .transforms.ray-image + +test-src:: .transforms.test-src + +setup:: .transforms.setup + +publish:: publish-image + +publish-image:: .transforms.publish-image-ray + +test-image:: image .transforms.test-image-help .defaults.test-image-pytest .transforms.clean + +set-versions:: + +build-lib-wheel: + make -C $(REPOROOT)/data-processing-lib build-pkg-dist + +image:: build-lib-wheel + @$(eval LIB_WHEEL_FILE := $(shell find $(REPOROOT)/data-processing-lib/dist/*.whl)) + rm -fr dist && mv $(REPOROOT)/data-processing-lib/dist . + $(eval WHEEL_FILE_NAME := $(shell basename $(LIB_WHEEL_FILE))) + $(DOCKER) build -t $(DOCKER_IMAGE_NAME) $(DOCKER_BUILD_EXTRA_ARGS) \ + --platform $(DOCKER_PLATFORM) \ + --build-arg EXTRA_INDEX_URL=$(EXTRA_INDEX_URL) \ + --build-arg BASE_IMAGE=$(RAY_BASE_IMAGE) \ + --build-arg BUILD_DATE=$(shell date -u +'%Y-%m-%dT%H:%M:%SZ') \ + --build-arg WHEEL_FILE_NAME=$(WHEEL_FILE_NAME) \ + --build-arg TRANSFORM_NAME=$(TRANSFORM_NAME) \ + --build-arg GIT_COMMIT=$(shell git log -1 --format=%h) . + $(DOCKER) tag $(DOCKER_LOCAL_IMAGE) $(DOCKER_REMOTE_IMAGE) + rm -fr dist + +publish:: publish-image + + diff --git a/transforms/universal/web2parquet/Makefile b/transforms/universal/web2parquet/Makefile new file mode 100644 index 000000000..8978c5d8b --- /dev/null +++ b/transforms/universal/web2parquet/Makefile @@ -0,0 +1,23 @@ +REPOROOT=../../.. 
+# Use make help, to see the available rules +include $(REPOROOT)/transforms/.make.modules + +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=$(shell basename `pwd`) + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +#TRANSFORM_VERSION=$(DPK_VERSION) + + diff --git a/transforms/universal/web2parquet/dpk_web2parquet/transform.py b/transforms/universal/web2parquet/dpk_web2parquet/transform.py index e5a16e2a7..6c643f074 100644 --- a/transforms/universal/web2parquet/dpk_web2parquet/transform.py +++ b/transforms/universal/web2parquet/dpk_web2parquet/transform.py @@ -38,20 +38,20 @@ def __init__(self, **kwargs): """ # Make sure that the param name corresponds to the name used in apply_input_params method # of NOOPTransformConfiguration class - super().__init__(kwargs) + super().__init__(dict(kwargs)) self.seed_urls = kwargs.get("urls", []) self.depth = kwargs.get("depth", 1) self.downloads = kwargs.get("downloads", 10) self.allow_mime_types = kwargs.get("mime_types", ["application/pdf","text/html","text/markdown","text/plain"]) - + self.output_folder=kwargs.get('putput_folder', None) assert self.seed_urls.length, "Must specify a URL to crawl. 
Url cannot be None" self.count = 0 self.docs = [] - # create a data access object for storing files locally + # create a data access object for storing files self.dao = None - def on_download(self, url: str, body: bytes, headers: dict) -> None: + def on_download(self, url: str, body: bytes, headers: dict) -> None: """ Callback function called when a page has been downloaded. You have access to the request URL, response body and headers. @@ -64,11 +64,6 @@ def on_download(self, url: str, body: bytes, headers: dict) -> None: doc['filename'], doc['content_type'] = get_file_info(headers) doc['content'] = body self.docs.append(doc) - try: - if self.dao: - self.dao.sav() - except: - pass def transform(self, table: pa.Table=None, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: """ @@ -97,6 +92,7 @@ def transform(self, table: pa.Table=None, file_name: str = None) -> tuple[list[p } logger.info(f"Crawling is completed in {end_time - start_time:.2f} seconds") logger.info(f"{metadata = }") + return [table], metadata From acc35cda04271f8bcfc8d31b846d025b6be31010 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Tue, 12 Nov 2024 21:14:53 -0500 Subject: [PATCH 03/24] complete full implementation and testing with python runtime Signed-off-by: Maroun Touma --- .make.defaults | 2 +- transforms/.make.modules | 1 + .../web2parquet/dpk_web2parquet/config.py | 85 +++++++++++++++ .../web2parquet/dpk_web2parquet/local.py | 27 +++++ .../dpk_web2parquet/local_python.py | 49 +++++++++ .../dpk_web2parquet/python_runtime.py | 32 ++++++ .../web2parquet/dpk_web2parquet/transform.py | 98 ++++++++++++------ .../web2parquet/dpk_web2parquet/utils.py | 20 ++-- .../test-data/expected/metadata.json | 58 +++++++++++ .../test-data/expected/test.parquet | Bin 0 -> 32039 bytes .../web2parquet/test-data/input/test.parquet | Bin 0 -> 485 bytes .../web2parquet/test/test_web2parquet.py | 48 +++++++++ 12 files changed, 376 insertions(+), 44 deletions(-) create mode 100644 
transforms/universal/web2parquet/dpk_web2parquet/config.py create mode 100644 transforms/universal/web2parquet/dpk_web2parquet/local.py create mode 100644 transforms/universal/web2parquet/dpk_web2parquet/local_python.py create mode 100644 transforms/universal/web2parquet/dpk_web2parquet/python_runtime.py create mode 100644 transforms/universal/web2parquet/test-data/expected/metadata.json create mode 100644 transforms/universal/web2parquet/test-data/expected/test.parquet create mode 100644 transforms/universal/web2parquet/test-data/input/test.parquet create mode 100644 transforms/universal/web2parquet/test/test_web2parquet.py diff --git a/.make.defaults b/.make.defaults index 3a7f690cf..51eb984ee 100644 --- a/.make.defaults +++ b/.make.defaults @@ -475,7 +475,7 @@ endif .defaults.test-src:: venv @# Help: Run pytest on the test directory inside the venv source venv/bin/activate; \ - export PYTHONPATH=../src; \ + export PYTHONPATH=../src:../: ; \ cd test; $(PYTEST) . # This is small convenience and the image itself must already be created. diff --git a/transforms/.make.modules b/transforms/.make.modules index 9a7c80c98..0dde12b53 100644 --- a/transforms/.make.modules +++ b/transforms/.make.modules @@ -19,6 +19,7 @@ TRANSFORM_RUNTIME=ray TRANSFORM_RUNTIME_SRC_FILE=-m dpk_$(TRANSFORM_NAME).$(TRANSFORM_RUNTIME).transform venv:: .transforms.ray-venv + source venv/bin/activate && $(PYTHON) -m pip install $(REPOROOT)/data-connector-lib test:: .transforms.test-src test-image diff --git a/transforms/universal/web2parquet/dpk_web2parquet/config.py b/transforms/universal/web2parquet/dpk_web2parquet/config.py new file mode 100644 index 000000000..b393de3b0 --- /dev/null +++ b/transforms/universal/web2parquet/dpk_web2parquet/config.py @@ -0,0 +1,85 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import time +import sys +from argparse import ArgumentParser, Namespace +from typing import Any + +import pyarrow as pa +from data_processing.transform import TransformConfiguration +from data_processing.utils import CLIArgumentProvider +from data_processing.utils import get_logger +from dpk_web2parquet.transform import Web2ParquetTransform + +short_name = "web2parquet" +cli_prefix = f"{short_name}_" +urls_cli_param = f"{cli_prefix}urls" +depth_cli_param = f"{cli_prefix}depth" +downloads_cli_param = f"{cli_prefix}downloads" +folder_cli_param = f"{cli_prefix}folder" + + +logger = get_logger(__name__,"DEBUG") + +class Web2ParquetTransformConfiguration(TransformConfiguration): + + """ + Provides support for configuring and using the associated Transform class include + configuration with CLI args. + """ + + def __init__(self): + super().__init__( + name=short_name, + transform_class=Web2ParquetTransform + ) + + def add_input_params(self, parser: ArgumentParser) -> None: + """ + Add Transform-specific arguments to the given parser. + This will be included in a dictionary used to initialize the Web2ParquetTransform. + By convention a common prefix should be used for all transform-specific CLI args + (e.g, noop_, pii_, etc.) 
+ """ + parser.add_argument(f"--{depth_cli_param}", type=int, default=1, + help="maxumum depth relative to seed URL", + ) + parser.add_argument(f"--{downloads_cli_param}", type=int, default=1, + help="maxumum number of downloaded URLs", + ) + parser.add_argument(f"--{folder_cli_param}", type=str, default=None, + help="Folder wher to store downloaded files", + ) + parser.add_argument(f"--{urls_cli_param}", type=str, default=None, + help="List of Seed URLs for the crawler", + ) + + def apply_input_params(self, args: Namespace) -> bool: + """ + Validate and apply the arguments that have been parsed + :param args: user defined arguments. + :return: True, if validate pass or False otherwise + """ + captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) + if captured.get("urls") is None: + logger.error(f"Parameter web2parquet_urls must specify a seed URL") + return False + + self.params = self.params | captured + logger.info(f"web2parquet parameters are : {self.params}") + return True + + + + + diff --git a/transforms/universal/web2parquet/dpk_web2parquet/local.py b/transforms/universal/web2parquet/dpk_web2parquet/local.py new file mode 100644 index 000000000..fea14b457 --- /dev/null +++ b/transforms/universal/web2parquet/dpk_web2parquet/local.py @@ -0,0 +1,27 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os + +from dpk_web2parquet.transform import Web2Parquet + +# create parameters + +if __name__ == "__main__": + # Here we show how to run outside of the runtime + # Create and configure the transform. + transform = Web2Parquet(urls= ['https://thealliance.ai/'], + depth=1, + downloads=1) + table_list, metadata = transform.transform() + #print(f"\noutput table: {table_list}") + print(f"output metadata : {metadata}") \ No newline at end of file diff --git a/transforms/universal/web2parquet/dpk_web2parquet/local_python.py b/transforms/universal/web2parquet/dpk_web2parquet/local_python.py new file mode 100644 index 000000000..b6764015b --- /dev/null +++ b/transforms/universal/web2parquet/dpk_web2parquet/local_python.py @@ -0,0 +1,49 @@ +#(C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os +import sys + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils +from dpk_web2parquet.python_runtime import Web2ParquetPythonTransformConfiguration + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..","test-data","input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + # web2parquet params + "web2parquet_urls": 'https://thealliance.ai/', + "web2parquet_depth": 1, + "web2parquet_downloads": 1, +} + + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = PythonTransformLauncher(runtime_config=Web2ParquetPythonTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/universal/web2parquet/dpk_web2parquet/python_runtime.py b/transforms/universal/web2parquet/dpk_web2parquet/python_runtime.py new file mode 100644 index 000000000..535a74ca4 --- /dev/null +++ b/transforms/universal/web2parquet/dpk_web2parquet/python_runtime.py @@ -0,0 +1,32 @@ +import time + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.runtime.pure_python.runtime_configuration import ( + PythonTransformRuntimeConfiguration, +) +from data_processing.utils import get_logger +from dpk_web2parquet.config import 
Web2ParquetTransformConfiguration + + +logger = get_logger(__name__) + + +class Web2ParquetPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for NOOP as required by the PythonTransformLauncher. + NOOP does not use a RayRuntime class so the superclass only needs the base + python-only configuration. + """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__(transform_config=Web2ParquetTransformConfiguration()) + + +if __name__ == "__main__": + launcher = PythonTransformLauncher(Web2ParquetPythonTransformConfiguration()) + logger.info("Launching web2parquet transform") + launcher.launch() \ No newline at end of file diff --git a/transforms/universal/web2parquet/dpk_web2parquet/transform.py b/transforms/universal/web2parquet/dpk_web2parquet/transform.py index 6c643f074..012460443 100644 --- a/transforms/universal/web2parquet/dpk_web2parquet/transform.py +++ b/transforms/universal/web2parquet/dpk_web2parquet/transform.py @@ -11,59 +11,76 @@ ################################################################################ import time -from argparse import ArgumentParser, Namespace from typing import Any import pyarrow as pa +from data_processing.data_access import DataAccessLocal from data_processing.transform import AbstractTableTransform -from dpk_web2parquet.utils import get_file_info - +from data_processing.utils import get_logger from dpk_connector import crawl, shutdown + user_agent = "Mozilla/5.0 (X11; Linux i686; rv:125.0) Gecko/20100101 Firefox/125.0" -logger = get_logger(__name__) +logger = get_logger(__name__,"DEBUG") class Web2ParquetTransform(AbstractTableTransform): """ - Implements a simple copy of a pyarrow Table. + Crawl the web and load content to pyarrow Table. """ - def __init__(self, **kwargs): + + def __init__(self, config: dict[str, Any]): """ Initialize based on the dictionary of configuration information. 
- This is generally called with configuration parsed from the CLI arguments defined - by the companion runtime, NOOPTransformRuntime. If running inside the RayMutatingDriver, - these will be provided by that class with help from the RayMutatingDriver. + example: + kwargs = {'urls': ['https://thealliance.ai/'],'depth': 1,'downloads': 1} + Web2ParquetTransform(**kwargs) + or + Web2ParquetTransform(urls=['https://thealliance.ai/'], depth=1, downloads=1) """ # Make sure that the param name corresponds to the name used in apply_input_params method # of NOOPTransformConfiguration class - super().__init__(dict(kwargs)) - self.seed_urls = kwargs.get("urls", []) - self.depth = kwargs.get("depth", 1) - self.downloads = kwargs.get("downloads", 10) - self.allow_mime_types = kwargs.get("mime_types", ["application/pdf","text/html","text/markdown","text/plain"]) - self.output_folder=kwargs.get('putput_folder', None) - assert self.seed_urls.length, "Must specify a URL to crawl. Url cannot be None" - + logger.debug(f"Received configuration: {config}") + super().__init__(config) + self.seed_urls = config.get("urls", []) + self.depth = config.get("depth", 1) + self.downloads = config.get("downloads", 10) + self.allow_mime_types = config.get("mime_types", ["application/pdf","text/html","text/markdown","text/plain"]) + self.folder=config.get('folder', None) + assert self.seed_urls, "Must specify a list of URLs to crawl. Url cannot be None" + + ## users may be tempted to provide a single URLs, we still need to put it in a list of 1 + if type(self.seed_urls) is not list: + self.seed_urls=[self.seed_urls] + self.count = 0 self.docs = [] - # create a data access object for storing files - self.dao = None def on_download(self, url: str, body: bytes, headers: dict) -> None: """ Callback function called when a page has been downloaded. You have access to the request URL, response body and headers. 
""" - logger.debug(f"url: {url}, headers: {headers}, body: {body[:64]}") - self.count += 1 - file_info = parse_headers(headers=headers, url=url) - doc = headers + doc={} doc['url'] = url - doc['filename'], doc['content_type'] = get_file_info(headers) - doc['content'] = body - self.docs.append(doc) +# doc['file_size'] = int(headers.get('Content-Length', 0)) # Default to 0 if not found + doc['content_type']=headers.get('Content-Type') + try: + filename = headers.get('Content-Disposition').split('filename=')[1].strip().strip('"') + except: + url_split=url.split('/') + filename = url_split[-1] if not url.endswith('/') else url_split[-2] + filename = filename.replace('.','_')+"-"+doc['content_type'].split(';')[0].replace("/", ".") + doc['filename']=filename + doc['contents'] = body + + logger.debug(f"url: {doc['url']}, filename: {doc['filename']}, content_type: {doc['content_type']}") + + ## Enforce download limits + if len(self.docs) < self.downloads: + self.docs.append(doc) + def transform(self, table: pa.Table=None, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: """ @@ -78,22 +95,43 @@ def transform(self, table: pa.Table=None, file_name: str = None) -> tuple[list[p self.on_download, user_agent=user_agent, depth_limit=self.depth, + download_limit=self.downloads, allow_mime_types=self.allow_mime_types ) # blocking call + # Shutdown all crawls - shutdown() + # Check with @Matsubara-san as this is preventing us from calling the transfrom method a second time. + # shutdown() end_time = time.time() +# logger.debug(f"Way After: {self.docs}") table = pa.Table.from_pylist(self.docs) metadata = { - "count": self.count, - "start time": start_time, - "end time": end_time + "count": len(self.docs), + "requested_seeds": len(self.seed_urls), + "requested_depth": self.depth, + "requested_downloads": self. 
downloads, } logger.info(f"Crawling is completed in {end_time - start_time:.2f} seconds") logger.info(f"{metadata = }") + + ############################################################################# + ## The same transform can also be used to store crawled files to local folder + if self.folder: + dao=DataAccessLocal(local_config={'output_folder':self.folder,'input_folder':'.'}) + for x in self.docs: + dao.save_file(self.folder+'/'+x['filename'], x['contents']) return [table], metadata + + +class Web2Parquet(Web2ParquetTransform): + """ + Crawl the web and load content to pyarrow Table. + """ + + def __init__(self, **kwargs): + super().__init__(dict(kwargs)) diff --git a/transforms/universal/web2parquet/dpk_web2parquet/utils.py b/transforms/universal/web2parquet/dpk_web2parquet/utils.py index 728274265..bbf9101e3 100644 --- a/transforms/universal/web2parquet/dpk_web2parquet/utils.py +++ b/transforms/universal/web2parquet/dpk_web2parquet/utils.py @@ -1,21 +1,15 @@ from datetime import datetime -def get_file_info(headers, url=None): +def get_file_info(headers, url): # Extract file size file_size = int(headers.get('Content-Length', 0)) # Default to 0 if not found - - # Extract filename from Content-Disposition + content_type = headers.get('Content-Type') try: - filename = headers.get('Content-Disposition').split('filename=')[-1].strip().strip('"') + filename = headers.get('Content-Disposition').split('filename=')[1].strip().strip('"') except: - filename = None + url_split=url.split('/') + filename = url_split[-1] if not url.endswith('/') else url_split[-2] + filename = filename.replace('.','_')+"-"+content_type.replace("/", ".") - # try to find the file name from - if not filename: - try: - parts = url.split('/') - filename = parts[-1] - except: - filename= url - return filename, headers.get('Content-Type') + return filename, content_type, file_size diff --git a/transforms/universal/web2parquet/test-data/expected/metadata.json 
b/transforms/universal/web2parquet/test-data/expected/metadata.json new file mode 100644 index 000000000..dd65c2493 --- /dev/null +++ b/transforms/universal/web2parquet/test-data/expected/metadata.json @@ -0,0 +1,58 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "web2parquet", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-11-12 20:39:44", + "end_time": "2024-11-12 20:39:45", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "depth": 1, + "downloads": 1, + "folder": null, + "urls": "https://thealliance.ai/", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [ + ".parquet" + ], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 13.7, + "gpus": 0, + "memory": 14.5, + "object_store": 0, + "execution time, min": 0.016 + }, + "job_output_stats": { + "source_files": 1, + "source_size": 485, + "result_files": 1, + "result_size": 32039, + "processing_time": 0.94, + "count": 1, + "requested_seeds": 1, + "requested_depth": 1, + "requested_downloads": 1, + "source_doc_count": 1, + "result_doc_count": 1 + }, + "source": { + "name": "/Users/touma/data-prep-kit/transforms/universal/web2parquet/test-data/input", + "type": "path" + }, + "target": { + "name": "/Users/touma/data-prep-kit/transforms/universal/web2parquet/output", + "type": "path" + } +} \ No newline at end of file diff --git a/transforms/universal/web2parquet/test-data/expected/test.parquet b/transforms/universal/web2parquet/test-data/expected/test.parquet new file mode 100644 index 0000000000000000000000000000000000000000..46dcd98151b8f8b7a1e17974a14569e978aaab0d GIT binary patch literal 32039 zcmbTdbyQnH`!0&RyAvR#xCV+B3DyDyiaQNZEI{!h#ogVDmf~6xqJMT2{xcyT-F zcmFu&uJ!xQTKE1zHaoNT%)IZs_B=Dul+zZZ#Gn+VR8gZuqeLM?LE&iPe*n-gprBBr zprF{gySuna@bkOdT3b3g+F3e5tobbM_$g5+$tZz5$TvtRu_&<^L{T`X|I4E^|E&Sp 
z`+tVQRYgHTL*AgFqN1U&e@0`VB%l>R!6KohLHBTVq$Hq1qdG&SN=IXNWXJjcHxMlk zQbH6;G!*0q3G#x1g1i@_eD(j5m|-0Sg$CK=Ztd;PZ|e?qlmtL*EnVHL-K9O;ZFt50 z2a#$1+p{$PwdH>nIuh%D3rzrVc5=6NayNJX=wkgJ>90|#a?#j-VzB=~|0n-{?fd^S z>i;(l{O`3kx3uH^CnVoLh5U~i_%F{Q=ki}#{$~xu;{9(8;MmwXT02=n|4->>aoA6= z{~zi9OTYh@5&uW&V{;fOv**$OPa7!cFGWWIxubRXh4K=B1sqa#y1KgX@X^CUySoF& z-z_fMv@aPNHI#Ze@LFX^6lm2%;nm_1PK6LmmtOjs?N#*xfQLNYg2E z(dp8uQiAdghoXPt;DifLprgwi%o@1I_VziKLe8;M9qp+6KXeW{s}%gk1(KT4;8Px< zzU|FC{ahd7lQ^4O6jo7YsH&_H|9dnnc?1}(NvHctJqeA*Aps8Y}S*B%eqo_#gk(l7lfR`FnVx~5bL{-N2d?2?JO_A_dS#PX`g~04l z6O3KeUpGNQB#RQjMHgQ0#V{125B{jWGp$2R)M0o4Vw|Wd`6$OWqag6PdT{oILx0E( zh{om_ZQh%=;(->90w``af9Aq+0qV)qCDF3kcy!Xj!LK4GThx;Gg5}>9p#-nL_98diE;9Y~AnES~4*M9dYAEj(!El2AF+rr)MF=z(GRr6B! z*?!_>PsHj-RPIHwt1|NCs!EJan@oPOc`Kr!FlLjTI{vks%2pKr3sJmP(R*<*ZL}b| zujLFw#Yq)ednK$PR4d+sr}t^XV6ZL(bM_+R`zI}^AAO3+St+W zlHXVbl|nRzQw0nz=cxD^@=0z~5{XNK3c3r*HP-?o2?R6~n@?~BECLQ$eXFU&byF2y z;gU)y5`Xz@s70;wO${q~Uw|=h#)YZ}{o@cUl5gTd^ydYeH5S}Ov2F30)L4VUwh*2ljr z&n7PlUuN>i`fQ={Y@HX_GvOyPeME`QPWcOt`H5AwC9If$t@SL4AW-(FXKz+wDxEII zPuaM=EVbgKIHVOu+RMI;rn~0~w~Y2i!E!;P=E+P9!YH!I*eT3bHL{oAnjXa;0dqpM zyJ;$WOM*rGyX1q9%6m(CmY4mPFHrG+=hTY^W@DqqV*eT~C`=_4pAs62wl$m)5_AE# zcFVs@6sf}5MBTD0z-%qB`lhBt8=Fr{oI3TGcM8xJ90Su*)2c+AqM(Vy#xht7FzLkW z3{Hy$p5TO>x=P(@BE8}5N2}hy77=C1DuglF;FQOT68mhmw^IVIzZ#M%QV9KzRBuw-w=7i3 zGE|%@A!Buyhz}2Djml1u;7P2qC{aoq=Md70&Sq!(T1@>`QI)1FOM7cHdPr8_+7vyU z5r0Q3k`3wB0+S}h@3n7Svc=?Rgrz==s#1Oj#W!Z67KQQ)n;SHCZQzrabXxzpmjTt*C5GD%{|aEqvi29 zkk=lFDh)HDrf2fDNRHQKDQqGU7>e$<2XoQ%)1bz%Q4?}E@N5-V=|AH%*48NpMzKet z&=85O1)`&^dDPoH%i9h_|96Y>w~$JhR)tI*xq$YB+(q;}zh zgmyDuiMZ+zMwu=$u$MFp;zd)bY{>?K;}vKfMtcha>6Bk3mz4dw0gr?VD88;iUfl(f ziiWFz@ir9&wn5p0sC2p#Q9uJal{2ZNpJ1X$^5K4WU#T0tO)Pz{yS~N@%-s;)r5K=U zGEk9+f+E;nyCHeBD0c`p))BWMNpMK?oO`osE^@;Cp$} zZ)JGdly2&c!PzR>{n?u;{Y5pZTT-Qlq@2h}stSFwi2WD&T`Pd(MrnOjbdG_g6xB0z z9JGeXz-$2Wn{UB8z-M^LZ?_Gmtx5;$`G9z39}Eur;Pih8=pa zsA7{4m?ejnk@JqclSsfKd2cuvLA@E&XI0S8Qi+c(%=liSL^#jZC(uG|NJo&#K;@uE 
z-oCxs(y%IKwvK1Essuet&6m(YuOYhBECp$r#wuOjt1V@*g=&Iw{ zFP<5t{fOd-a1Mvoqv5>N0M^_zs_p0DVfJudnGns?dQdUb>l>zB6gqcVQzq$g?1Zn%xIqbO>)EdyT z!LbzTYUt6~ujI4waI0HDL`_!DvYKe9i9p;o=sQ}rLTcnm#6$o9rxpo;szwTqky{d( zNF}?F)weGznBbyX~L5-Yxsept^RdMoSrpvk3Hf&@?vK(Dqt3h*;A}XJZS>SI zhiwuHwLCyy@}@sPC+@f22M~7sMMW3z5I}j09gY$aEheR3l$Ym5Ny*s zh}*y}Uf6F)fOLACK32w9?CF*=4r?71TUD(-?=FxL3kx=h1Lc}!HL6&kw+l*lWI=E! z9*|Whv8;fFRIVosD;$rEjGcoJPXHufRbUYuih~o2O^In~^NjI#RvLO_6P9LR6V@8a zMsO4Z@pO^^X&BDCOzhM%-m|F>LC#w7(^7o7Dis{KSg|I;A>l}W2&E$zQvLctaA-G4 za}=0?HC5Xt+MLC@;Ykx?LeYQ48m77JrP^qx2+)5EcZj3_OT~vpn{4Xgdg-9BOp!TkUwgg+6 zycV0ZP(vBI2)*sAB)zvlRkD6NAW@hN}VOj03Y>IDxmT6bm zGm&Q#eGza8B21L0<43Xms+BPb)$pz5+j})d1jN~4q>xa{+2DmX-FB`S}#O>rmMSkx(SoYiFFraA~H_{ zwkD2;0EeCynAn(C7>Lc%`&L5Y#b|ez7*nza_z%`M61LfTX$A#cKjg2z-s=0_yRPjC z?;V(CKD|55oC{U?EtkCYqOmzfNGN`X_ttf9EKhnd^ub#um&N#qrWd;nyEme@`}c#h zYO;wYGVQ!pwwhmm?LKm6ew6=G44DO)cqTEQrO8>=RkdGw_tWeQT@vx#ED8CTu)kuI zl@j0}(0hb1)z?Qr5_k_|-X$eNG?+N>RLHWxM4R1;GiqAGhLJooejLTg*>;6A#2ki( zYLTZFe3Mo?LeK4nUI`5slv(E#I_&NR5*BWDGjZ6L@5E{%&9>~5RMC(ErXq2F2&bve zu!U40or}Ms_@C(! 
z$6_alE2ef24rZnN@Ln5LX`TB~$z+)xoLPbGKL?y&nyxV|H z!;VLI-NFvM`@&u?sW|_Mh96z?M&2DZ^~ZH&wq9M#HqCf;KbSe%dE$S{GbVk~TM2)# zDris0;D-xA@t3daVPQ3DzAe<$O>o-E7`lvZd}!gFj2X0$zqDh?;q-XspyM+ELnkIQ zFXpeP1Ab9T*Xe{Uk80J6>Kaz-3BCKim{yJ$h}p~t?4E1>GrU%L z+z5jnrEzu=wf9`*2|*9cvS7&X$TCcWJw5IzXozXzbhKSjx?L(;IPdjKV^6zSg9WnG z!-N2oztAC7%z(GwWEO_3&Q2x+gL4PuSt$R-SGKuW9pcfFK)UDhWSOd{BIFd&e=~qA zTtjw0zgO%Q@<#lsNy)b9%bopK%wm%u(dfJ$2_jXI*dHHss2YRSqHQZU zbj_%fVHegcb~945VBXSm19E$YGOd0;yJt}h+F6`Yd^*`F@A#%bU(1R>6{%<#xDrte z%Jw>TnCi$HCJjuzf1);}z@k|!tZgbAnD`lqZD%xf&c%SK@<~7~RrkUfR-_exDv%9G zj6xP&W~cG0h?6GFM{?U`b+&yx~`yyXgP zvNbWPaYZzqMbT*{Kl9lGWpTP*wQ@Di4KfYteFE_(^zCluSJP%SGn`+~)GqALkA0BA(?k}l zrxwvn`)U#H70QhG8Ng6G_Ce$_+3C%=mkovfEwTqh0e_bL{B2tr=K62V7-I5~aj&Kt zzr0(4^Ui}sgg*i_#ud#(1Sn9FA#SC{2YngrMnw~$e3n&Fkp&zn`L$k@O{^-IsAH(A z#8?WXjt(j*rvB&c?*H($EI%PWng~TqE*G+S#a@s%5W`NxG z;!{5ZKb8)CGW`b#vTE_mlfm(9)XHZ9%;IvBktaa*Q3ARFJ7A*RrkFYwnbfoRVnIgQ zAS?{vN=x`3hL82BxjIPZ^;<2$gdkrljJ&s$x|7N!1gV9z97a1~ zSfK=Qsw@hU)ojW#Ok@KZ>?s2qx~H~c>dIJn(a2(xU6w*qE!hslG!-mMh(0t_4+Mjl zC^!~_#dsR|MO_owsj&W%9yA=gO#Xts9t;ZB0uZYTXuYSX z0GawRF}>coQT+U<#%e3)VkzjLLn)+kBG!L&NI@YT7plk~ttyl>ZDK1+0bH>=1knmf zS;gV9%F1PdDR+MAsGZBR4WEpES7){5Q||+xx<(g}$`#>K(UGxcit~fmPfq$~SF*&* z^VHb?Bht=s%(id(Y25zQzQ>376ylh z%7^cCT^3}avK8AsPf>NC=D{J=AwhOdE;NKYojP8^VXN2WC}=|vj7LX~M{&sbf~>E| zASMflKD6HxZIG5)&^?N$hY`gN0+h+tzw3SVs$jHARw+~b&`QXz%6d|hn{@~mO`X%; z05tnJ%9qElOcY*CU=`UNhtlb-=T+X1*yiEB;T;*Iuw;^T8oPP7>1HF5tA#O)&!N%>X4 zBrYiXwy0p8>?qS`s|rcsOceertJSqahe?9);=U%Xi{B^X+dfahg`{AG;&9n&3dn^d z^N?aBl*spLc@w=(-#b+SL{k@91S^H(#VG}Ch{qS&>80i-*+&;3HB2jTZL4Q1i$_~9 zoghablby(_RY=OIDZ4ATdz>^9iqy1hgj_#hv?9vHB}g7MtiK4waNzMVHqbO>8YEGq4{7RY>)Q-)FsW?H-JI=r_BseM#&E#_GvY`2_`;KVRCyWWGx0Bm~qfk-wMVdQ%CY~>>ywn zV9I6QZ8mqCjq$pQlkV*nc;p>>A1k-L2!rgD%RAwI9-!vtA1k3G62U@2&H{s?*&~rq zSwbB}KpYqh7M7(90+9ni=pa_QX*&*`)ZEETl2c-C_QtqejdPt;cOkGMFT18l!!uDCAeZX`$jL(r0Sa4>Ii_0@72)!QnjDdV!MJhw zKrRrEg(eWhqj}%-a;{2;cT9twyj0+e^e}P)$T9*b1_;3*#1RN%QP>5w4MGJH!Bp9e 
z_N?T%WFRgcp|WO@AWpR5U|donT*~@&$Tc$gk|!qsk>w;<{vF@hl&wk>&QgcCktg37 zY7iX`m@Mi3 zVj7Lpp|Wwg02&^Z!AX})Vc*UI8Xnm?uHW@>C6Q|dnZU- z#?jQ|p!zNA!hnlG3e28tF3ZNtj>S-ZRn{M@gdy^VqA(~319C*~i~RqGiWuN+akR;# zH#>eD(h6HRM4b!t$5kdMTp0ei7Ai> z<406kMD)i3Ab$3^71D>miGy|~nUqUm(fx9z9O*W$gRhhd3Np8l@0O4jo@#pk3Fp^V zE&7@bY&?%*B|wW-l!)(lFZo(Xr-_M!kr4Lj&>tR}bs$EbU1W{aJus>ihpz-um?YQO zY*7JF6F^17^_0STI&L%HrQm#Ui%usX!BjF5QEjtr9DH=VvCDtN}&><{%wt%zBqxu}lez5#t{H@Z7mN+I=;o%v;S*4?CE4l*b_cKbphe>qh2qeF15Z$J$PKb_r~y(2 zQ~>ES6yih@l}xaA*MI0zy7+I2p+b=*)xe+Huh}w`*m|h{A*`TK)N$a)y=Wu`gCUt6 zAMh0Lu%zH7GgeqtLMW6(4)Z?-qC*n2E)Qly$Hg`g^HsoULfR*?&=^9-*tqbLq5@kg znjfPbw#G*O5>LurG1Eu*DRU{=`PBK?FRk4Tl`+f6FpDCFXL$V!A zG&M~zujmVc7db1^?c^Z8hkN`IvEHRWpzaAHsS5k|DyYF2{p7rNV%bQ6ra{bZ(Eudj zp)qVxnN5MDa<((c+XG9G1hXFDjDakw>cEtRR2jQfa#{(JDx9sdRHPF~aH>mOv@=LT z!m929iBBOSS^b5x_I0Rdpt%3=a9GvshFXM~14w#Ac>^W5dscCVM>{&b&z(sJBP#c| z7+rMs2lrT|@i1L^+4h6s@5ZNHzHvK9Wpdlhx83D~WgHveio}NdpJdnbHlg zB&8N*reu>6kg5ptYYwpFDd~at!i2MS(BgE)z|Yg<7&)T~G)r_tF$gH#`5m#4+f!^D z$z_Ie3Zj%?K}GVsY}#@f6%CHoz!G3Na(4TGocBVmNq8a@Z$U`VZ=*N%PSB%JNrtpo z^B0m07z(987M^a}Myl33(<)!+(lo^tiL1*vv$}G11nacpt1?k*$8R%=H{y6>YK{@m znBgVFXYPS;`r@+~{5%JU^FrFE&SO*Nr(?GZYIswn+u%VhajVC~#ljUW_esElTi9yV zRo;Q`_pz$wwf>!bN7}}o@wrt>*q!LFowZiaZLd4^yG;pCkE9Dx{db|&bL}0o2lXN{ zJ9pV!V=MArVR5`xRRWi=oqe<87u>`$&2@%_jn>C_0n3Ir?fyH9iF0yad<`A9`=N~c zZV0o=-iUi6W?>VYv;gOIhwUBf*cnErkm|93EW?LY&o<8kkN07;$8K|vW}o!tKee>L zt{@}Fi@8_K+=Ly_L%h7&7dHCv!z#f?(RsXerT2Y(9`m=SP(Qt+)$r=1_P99Rj}!C| z2T9nm0b;=aaPb=ITlK^9Zl&o4L1oBt*8yEx{7z)b=6BFcdF7qo-ZO|875_SUA+mH; zCF>PdlOf`8u7BUuK8I+!aX|E&J)C;FeSsDup|$rH=mD(SoaO4k-wS$(;I;}|IvA%M ztRh^Mcz<{l7u(eS^K&qUj%77v>+1X3!zItQ#!U~6;=8a3JcsI)anaU>7ZD@bBDcn7 zSC{RvA~YYfMCS1~+hPtj4!r#CPKz54ua}%8WnvwrJI>5D%@b;u9qHZt9}pXzdm|fO zCRpaiF$lARUie`re zmz>S5JRFOC!^SSz9Nh7jE8Wbg4D);oeft#QL=7_TcT1I5h)V;=(vDd^1i`@F2s<+S z_;5+j-D$RC#@OpWF&s+`oAYwKG0bs8EOJjhbTsy3!4U_-@YSjnr}a*yene&y!uUx) zyU`L49G2-4%>|;Pv&Uv7fzH%;r&hqdlZ5M!$HQ&XvS`_ncc*#Ii>2^ILB1>Q 
zqwfUvoA4xaL{sy9Dsl>Qdzr_QP%p$h_Y&aPVSfWbZJI-S^ul@;QRHxcSUS4s>AwAR z(NKD_s|c_rii4Ayy}zuQS*YU+mAyIq0p4%ak6YlvS+=r1vv zctQzkB=zSrjcq(iN53-|TdL7p-DQik&QmFE`|}@Mc1{@Qi1RyM%-vO|-F?jBu3c`J z8&0VApQBp8bbyYvhgClwm>oDg&KiGQO}BJ#OMqHgU#CMVy<(xo9nBLaH^mNj!&lB$ zb6(-lJlBws!(kq!W{rHv-Qi-j^y=3iM;?cZr%laop8T`gTCb|$ze7H(nnK*)AK=A) zC~gydf1iGl!~0@xC-dU$XTpapHxKO2jOtmlJCB|5ChN+)n8y(_VRyue^ke_7%#pCw z1TWlS9O`(yE9&z%V5C~Q)~)`oy7%guS9#Yj$8eQVV#V{*@nL`Uy3pXQ(cNmlkzG9g z!;Ca+rFuf~GQC|y-u$wWes<5r-w2o?au4nwAk)HOC!?W{_V0aY|ToU3)76+w=o`#mw|d{&Er8(*AhhagWq| zz#-P$L)wn$Z6)07y7uJi;R2D-GDow}|H=Bw`Z#TG#j~@27k`!e>3DOlAJ(mZJ#xQa z#T!%Gbv3)-09j3_t@V{|Z9veNOE*~W?bq-^)T5@L?lp{v)@x$$d~0`YOLs}XAxafKXDPkjLc5PUcvie(M@0<_6=r=telK>Z1MSwP;6M zShZmPYO}@%rmM|bxWpio?I5HU3h&=@WK)jlOuX<99jPActYUU*?7#kGTq*2_b0sN5 zHHTQJfAAE5J?JgL;VZW$UQ4A(tIqI{714%EhdYqaUC%WQVqn-!;1#0Kna<_A72RAQc%vzUtuo*Fs?6~sBv+9g( z5^03J$y?`dl#pz$?sB^OGhzgRFu}_=;Y)Ogfz>i8ng!2FHy)?Pj_{Qw=f>6-X3idm z^P=2L`B%o$o+dVjvzNx3zOd2#xC_yMX3bNq!aM){>Q3pEug4I{JBhwL`c3HK(MWZ5 z%X+KC%Y$nRguWEa`CQZ^!Ktn3!q+*c!`ZR3so2Z-YxDJ8b+x(s!7uZ3Xfww@Y+U}DR@Q4I3 zsUk=W`4MGtA@mFW@aHdVfa;rv6ZCF=E=4XhMyunH0Uq!m#JzMi;&oA3YYc7AfQ=r^ zwWM*HTxJH>Q9g3JT{Sbzo%pJEq<{vtn0&XOr2kq&w!*WEx zt%fi2Ip+MW#(K|XbV#S)-Kr12Ih<<6c@}<}<4fr6U1a@L0a-uC{7$q;1e31mu55R0 z&za5X)xs}3V4sXPFKf5;1M;bBTjOFlm&`|wYt5f7?(mNeDB=3G=SDsc)v)^C{O7KB z>H!~HpzYV9`tVhX8n=hA>hIMv8Pv^_#g7#TPy8hu z=R3?Rxov2tpW)k;`N|h(8?(L-9sC1xO}9SAPUgk@rA~~s+a9k`u1@zoI#*=OoQ!MX zqcNvOJ~{mGiIM7q?K`Ta-3x?65Mq%&=eDN#Pg4dp|JB*S+CEf^u?9XiL;Yxva5#;F z-gNLcACLHVdziTx*7lu7A2>Zco_Qe0UG}N%gST`X!k1bP_@9DDs=p`9pm{l=et6F|e5JwRm@MPL=U{Xn_VGcqtn(lP zQS_nRj{l_x!o-sQk?w`irx6VVUrkd_X$P|*qNnuxk>STX&d$|u)S~>&iD?;vX3&eK zrsfCOqR{Tl9rVW7C8u^}-r)x7gv`}0#{4UAs^-{>j0e{hAszUiG!f_=eEAmtV-$Vf zEe!H7+q_x@oylnLd$C~nX@yQU72fMM2sMIs-X+*sKhg&uypgjrh4OkAGD>}SwTH` zqUmbuRp=#OciSAcxo@ZOvgi)^QUi5RZ9A!tn}ZcZsRD^1X3@|R@%Fn>!6nH*L(p;h zDa4bs<=nD?_?tnSAYyB)u2}!43#i{P+IMy&sgMj@U{i38#l`LqmF$njB_tYO0jb!o z1+vqSrv#URRm+fvOTOY`TBv~%kyxBgO4uzFi7Iq*aFk^t{pjY{=>*tUo@WcLcV+(D 
zU2>7;8u~6UG}uFBp{An=Qv7KpN|_L9C6bTj9D+$heu`ERor_OWR#>)UsU{}CZh=Ze zPDn#e2`W_aVYQAfRqgqA97uqjK@{0*D4MNpSEj{Qs)csEAIL&YpfPzzXbdk|fJGo4 zXhcKqB#(TXj)A8xNN|e%J`_!W9c1g0m=@bL`q8pCRV^(9Q`8_3jVLxsNr5#O2R{xA z^t|7W6(1t6fZqQjmpnXzGvpsl2`D!JC5RiAu`8Sz0z{su_#nW}v9=~gHW(zhPL8)$XZcU8(a`JxD}X905BoRSMpVd#;8z?- zPmrM27;Pa^J-On7|2hsvvuF5sEkZ$RnGOW9nEZi7Ncb-F2&rzA!YIO6-2TvBEJC8u z7Q4jBE?Q)3{||RQuIPWNwY7csZ=8RdOc7YIQOHfGe+z~IDME#y*#pMFsEUfyJ(&f8 z$limCu>XxMSZEoHL72JTMMCU|CSql~_L72x& zNKjC_ZMQ@txxOY57aLh&Dn|Rc6p)L{mE($xThXavkc*=tsnPP%zjYYd#qyXc%t;#T z+5j42rA7NB0nHNR<`_6SG%_56B%H-U&i#)asipxD&=sJQJV?qSLW)Q6)*ZEvLVXq( z1#}DA0Wl|p8xnvR2O4ad!j+jA)EZfh%Gl8`ilO`iy@55J4;AAgGZr0Z zP{9}aQfTRY0zob+3;fu%B`D;P4TgU^9$+$fr3lcWjvKpW_pAO+Zx)eWfV_p?{|FCm6-y{tm zGc-N-@WS*ozW%dNF9AC?z42d#_{YOn(=NT48-)JEJpd|wpEhnG9;Z+%{iQRAhhyHd zqK%EstY@O3-2h%rlc}%M#NmxNH(x`K4YjHJSlV@_Da7+&Jk7J}(FDQSh}bhQbbr=? zv&2(hyyGx#TcbJIM$nt(C(F4tk7;<5D4 zFr;qNt$wbZ@~n1XNAFgA56}1m!=~GQt>51rZ%YUN3Ce-aLk8j30%-R+YX&CsW9`KT9hTotp`< z&`sxE#C>%@&N=p`(k^j^^c?lVrCVP+>@jyiXs^QMZXo@Jdx?qq@xb>OUe%6v)3?22 zWL4+?V1BwAQF|l5cep6A$J43|=aD#f{ksciSngY;X>T&#eM*?4Ma-@AUyeT9)w&(8 zQYl@*#?6*bMKAy4U!8g_?X4dABW7ug=RY^a9h(0QJ8JQ@ba!?#cOUd-{QIeMti{1A z=@2sB&MzN;m~8jBKAO#vbT~My0C_=2MO(S={nqbdoY#H-tXwlOBIc#~&35nQ;D^$D zZ7(jvjvfwHSm4%U^oZG|Q^ed-J7VWvE@GKVDPYBBwKJ}rH3-W$OS*F*<_g!GBP&P(!P3KBj)*G)mPYfXS_WrFArWXDXC!o z{^*c5bh+a&`=Ou89DcZ0d)U~yi&)7!hn-Ue(E2nwzd!hc==HxJU+4&sgpEG5nFl=4 znOuJkpnr7uPPi(<@3(q*=({?0)$YYO*X{);>RjhO61=~rUwygdalCxy8}QVA*e=<8 zDDC+D*gxd^)91}&x79lMvcqQcs+pbha=<<8v6eaS zu`@j0+^)Jw&}^_ndSzhT2jMnArR3#>U);7!zbyG^eYEi48ZvPRc{-k3P@bbN_WkRB zlw<64>;C=HH5Fk7e#?7C^$0W$C4zC6VUZ7B`p8wzlTg@kdyD@AW~h z@=@Gamb0J!=SJuek$?MgUaj!emB(NQq49@=xxRs8e|nvxW&#+D%9oyNVy$5wkIm{% zljSdTbVfYjpg<;IWMzi=Z`kqhD1A$Xy&&W+oAMY|ZD{=0dh9|DmSjjt<6qp? 
z5wYr6{PucP!Yyv>w4wCaKfgu(%5R-JaaDBZcJ$7S#qsX?V;;BF5IuWLqld;jAg z%zV{tm)Scd4{JV>Q8k;1%`sM8SvQ^ai1MU9P zW}JFdRf|XMAr#@}u|(H~vv9_J@k27RTQu~#)wLtOk!ck@Y;Oej4wcYNRB{H=*eL{&nIfBg((3Cqy5 zG49XpH8iHm-QnP9ezybK?OcW4Qky-j%U) z)p5&10F-)8{NQEteeL*?jLo*l6>DwG@ZrIIS{~$}=<*8IDdF5sK*$ z6Nhd!%L#B}^LziXCb=tw0b2kZ9#46-cG=eP-jSu!p?Tc&DGM>UxMA!$_bDDx4cZDj zq+`DGuRdD()FyxBW7sA?yXM1vI9t#C;`m~=-@f5KC~IxJl!`GlDLTU^^ai?!KPD)l7e7*^W^{YupTFn62V3!Pj1 zeBhz+@%qw~8xEWL?b}(`B;n@rWam)*bPO;2Y2X8fX-kih|G zfZd^5@;6Go`qRyoA(`Vo&fus6&Xd*^74#wty-{_(lp(1hL@sw$Bu5-H<#}w zE6jH#4MD7?u+p*JWl;MN0rNuOG?=Q>Cy>;Qj45p%TNlH5#XZR|aZ);g!77f$J0{rz z^zz#uUCSUNJ&k2j2C=6)zXGlrg8{!B0Hnj0rRlWUZ_NN+14p=Ub5m;;_NG^VNPtxmSN%(}eU3%uKf zak^tyeke*9>yHsTuE$_m+G7J&5_4+%!6a5Sn zyMSNxGASgX&^JSU5p%LDyC>rDDAWub>dNge&S}rzigz^K**|v1OW^@u#|GhBE;PMl zJCPzXO*!WN)Y0t4GD@pI82QXKN-# zvVRP$I)cQ0VSUN&*Vvt~VDb5xijaT;O|U*|n7x;?0)p88GM|;x4sV9o5{H=R25iI~+~c2iFnhEZ!w6Wk0YQ>G zGLkF4g73-DFugc?$|bo{wGc_m6l6n*L2H%ojJ)dOaodR+yf_)Z>Y~i|*omqDY>6)z z3d@6QQz4C~1wp}^9&(b&+_zm%u!wv5EVp8eMT0Kfxjl~i5!XF&l)zJ8^C6}0k`8fH zuFRpIE$>gCvkn_(wq7 zGzV=Au|*LJwnWxicLF-{_gXW9GO~#ZC8Q;!Q(~F@-scCA4XiEV-WrgG7+cVNI2qrH zY^~CL_NZ+xS<0J5xYxtb}B0=u*xJD{F=Yq8StPmW~4f_g=?z zejN% zYVwak53>T3a#K*EJY}rHWzq|6CnE7!qn?RW%ohG428;5Xw@mEZ(y!HL23Afj6C`y8 z2UK1P>K$gI==+_T zU9QQoVHM*g2g$|=WUR%0J8Irc^|D6ko)#mt4wh3-lKOK$^5{zXDuvT*?Sg%!P!RL- zldF0FpA|rz<7d0JdHdf<5%qUXCSl648QKn*G}2r;dGjyST_0Yu+_pupQS9E4-XvN_3;p1=h}x0Ci0|iQ?$ctU${rN)h8aX5E=%h z_jJb|G4uRYORICe5=hk#M(n$UypnxSBG1C|C&T{X8BDV6zU1RU`^8)TblfdCz$Mk4 zawFMxooUUpU}7)q{hC7fj|1utpex$YqHYgZxyekK0(KZa-vA-*5A%Tsro9I>qT(mp zWLkid)=6%+=D;RD%l-4`9uwT$o@5Jev{D=>Ev;18KL057SIaeQ-oY;G{7VxxckvHsxkcTL{l_x4kXTY+W-{nNKM)Ze3fQYJg%tzT7S zsFu}}Iwwn^C|In&h;4BZA(&Rx?wR*;?moH6Ah(b%$tzz^v zKX!8#KpM5RGUJB`Uy-cNMi%AcDUpg{6YzlYX z?&n-e6ht?g4J~?5>IQ%W?4GAPoxK@|0Qh7mzE4)et3R=Ml^*^}Ja>l{;3d&Cx1en^D^t)|2;mc5HUrssdnMb=Qr2A z*TQBmm&M9n8YV-=AkNIX2+uvs3=ov92(NseSE*8LHE@+6Q?EK)&!|bN!-NoxEoqTC(QDIU1Ap#=32hIqr_F6L~Vb= 
zrHR@u7W6#AzRbJP2cb4HEh??aqr#u~Or*8M&eT$Mf$`HPt?VVumbaU)iDuiCSVu|} zB3oKPJZau3`IClDSnpr_8Mu_}j$PmW;G1s9INkE@$INz6vWDfn`&b%zlhE)>Y^>CH>n4VxsxW^w&>PYjX5#Ja-M6_P z_I9ZG>8SH+?rI1Z9skO{<2r%Y3#WeHWnT$MJ_T<}oLlE=s^lz2eSQ@`!5eB!K3%#_ zm4a$eLHqWqnP;s;hXSojyK6e@8COLj8&~u@#|uW{mvi)SV<78+PPD8EP1qx?Tw_=a zB5gZpzq|T{^G(k_cs|!1I7J&TRWHa-mId*ak6@4)zd-(q%r(gyf!Ysii8j{AfAB{&frL&zcOrSd)wXWWY)kVDpuDl9dBSSj@R`l?b-uS^o|)iiEZIw6 zyqv8?rC}aZNrCz*yui|FqRnCS?AIApr#i}E42ms**@3Iq9Nsw! zmJj8*)7xF&{lJHY{`6uci>xZEem#omScxGL?6_;v9o4~sZqMBlg0ZUrIN9Z$Nt`nJ zFMke>ZrrkpX_7Ei_5EBNPtwPUZdzB)ey7eUXXDUA0(euiC#NBi7B^)a!1Zf9TCC;} z*I;CT*kouM4cou_J9I*FqYP_JE@W)Cl?8wMgSN_tSvG$YB|1=m23Z_@XLo0T&89`q~$O161dYrT^kusuziv?bL%sJhJzbDp`3soBRo(l z(}yRRp>e%3s z1%@)*Ob127#|1t~WmS4-lPvbmOv)^2KKUHFEiM<)u!)aVSaUjuu_#cnb@WDCK}pU> zM;S>oT46IF1$*>QG>jLbgd(rWV|6_sVMmr(91qVQ9w`dHFE?T9uP>}n-r`Pr4olw+e!-4q3nbB?vIPl-+`UzQf-y)R>sHsT$ZFHOx5W^U+WCMj7e+_D7rH`gJJW7ISpTeK~ ze=R5;*<5v`b`Yi(E`NIu8B}x|HhJ><(O-bMi_l)eN|VO>u;|R9?}9SuMgzg`|9AnVPMTLGcz+a z%*@Qp%*@Qp%*@P84JW^blZKfYnxx@0m-gLz^X}&UyO}$UEqmA6)mm#=9$Vgzi7Ak% zYYU^KpAsd=asE?TR^+=Mj0$I=4~EQsnsPxec_-QRq1@?{DP^~3x^`mK4yq%0-Zet$ zAL-^~21zSgb=-Q#p2%GRjNnmit2!xFFWEV0FhWKGEVXX72Pb(LIm^J}MBiRuYDL#F zZKPS06q&O>%WUE8Nv|;1lGxd#>L(QlwP7sR#`2mua*| z{Xi&dMSUX8WkeTm`;+ysRjJqV$3cuQGet^AJ5rH6s_CC61;*5lBIi+4=cAzwyb!KS z=-{O$KujFrFgZsf`bqO=`)DH>aFx*COvDtTHKX@zqyEfC zjoN_RWt9L_cWe~)8^}=0A zL4>1S)l)Sun9zcmTkq$M^sSBSim3W+-sixkjrM*{vl6%ULZK2Gq>RjyMoY~cEp2iZG+1^qhv#{VE7R+N7nb+N zg_iG<(A*TM%KDk$V`!15&W;w-+LR5@r%T50_BJdcr41)@O7Z8{M>D8YF>3-t8j4&U9sk~>L3#>#i>er_HkQrdp>zd@ zpAx5!;!bG8{I)Sg@SQo?AJz*!AVm##mDHQN8q7keeY|r?bvNcaf~u;r8!Wpd@))Ms zNAIW#RjSBM4EgV*6f1!$@J?Zoz%{DR6Gs%LG?7`oruF#@OTdxYfQ5i9Bro0#PI+n# zUi&m~f1Oo_){7eIorCrm0m&l?Db4sTZw=vecn5c24;MmHGpxG16i{@6=FbR8cmKT? 
zmr_K6zA)%Yy0?LKa>tSp1r!M~mAzt9AFc61sB+|LDICaPXZoS1sT6sZd80qjrDge; z1V+`uF(iT@?1TVJ@sIRFH3wL#lrde znDgtOIAd|YX@5jX4RVLcxO$dimWXc<2Q;XJo4H4Dqi-j;RGdw#d6`9MP%eP;CCBb6 z-LdlLzPo5Six3dc90|`A3hl0G#%a#_TAqmTZb8kf^}TQ2U@-6+eDBS!PGMs1R^-oG)7vc*Y_Q66r3IsBySL9`y&%2Hav zE@b}s4h71Ah~Sk&G--PtsCPQxer#eH>(He?T%%rJPNcH=CgtHfvO>ILgScRaTxST1tf8p0KU`7uIiT zu8vfZnVW7}Y;I3MIWsrW;qxJSaoIshPdtN$1I1i(Vy0b(dT{1PtrkhWmx*a>3tXLq zG?L;-@>VP$(nQdu+&io*?l#kHRCSnRugNTCZ%0x^`tWrAyeoaJ=UIr4LBvk_(RKM- zDRl`b4<(=DCJa{I1&rYw9$US3gZgQYjbC>-KnyC-G!_RldL8G2&1B0z=d^yb{xMY9 zG_*kL65HaE*_cNyx3K;F6ya+bKagmz_0ELyJ73Mj&f&b;_4bYV*LNbkeI)_D6(T*l zy5VRFMuG!&FGapXMBZvursJz|WfXaDllB$p2elQt08%{S6;32ahfW2Slukr3-bkL| zZ@#8pLTuf10F=rt1y6GMl#frFGL|v@-31Ar;N?@b-B_Hd8nD_fE32_FHHq~IQ`Vr#F-P%#^gDTt_cv=T$XFX}5 z9NQTy1C6$_A%t}o`30v_O#;EjuGFc)=qa6xykq9!NbAUG&^UprPJjN8I|tT>?plG+ z4$Eh=si7a7x6}s4! zhoMPi&Kl5-qLlw2gco=8-*3X355U)2%^MA2!9$|in#%n1>znq;krD5B#n^_jm~OBo zU{yR=qWR10pCFvD5f_Ho(*KrkO~yVQbLLs<(Fi zxu%t5q-2jLP=qmR=VE4W!yArs2h8`L-A%ub94#JZ@zYa`E}4~F<&gCIN3K9&$`zY0 zEC;)Hh<##1GGW?9V3%?B1fg^tF(fw3RT}X5PvOxzN2DJ%r4Cgq^r!J*y2_fsZa--# z<%zV1{k|I=xOAk7p~v&?uy+~MYg?7s6*-e@(0~HkGTJVR8W=WBhaFdQV>&a?X!1uU z3sHgEY@^EUU5&;w{aa%JLd#Pxm&qEYl#kBON5&PM6nKrbGx0xP(6~tEj-==0nzde` zHKLP$?Ad}Jpj_M#A1xeU*WtL)1_i*p?1^h_NMGkTfAq{hwJ~D~#*@7473r0y1yVA( zbelqlnFhukl@g(NgyLVz>T=~e*|Xrayr=-9%|bW`Gi>`s?%~aRv%9N$p%sr4%bX=J z6}>JJa-WLKJyO6H6e(34N}{bqpNP&bOYfx|Y(=0f!|4(LGN@^B?dx^&>6Wkow@a0= z;}by2$?;J_@v@uUt)92~V9_n(g&lGUH%z zEB_PCSgpY=7c3O^1_J(-;<|{pn5^sKp~YZ^NJxsiwP9wF>v`g`QeEN{2kL~9TbZ!r z`-NZEiz^(;qf8tLT}xcWH{PB_oy=L=MNbgtv4qN6{^wnhsu5|WQsnWp;%$AYm95aT z1^cY_cC+~OEu9qLiQLJ);Oik~9G^??uJ6LP*WUp)Aq#t$J76~koxol7xQeYweL7Tm zZ51;w!IbQj&Vlv=!S%W5V#*Xe`^riFG_ZT0EXH<75oN7v-3dbLRiZ*Vi`xPOH?-rO zk+fzm(!rufXk*|;PP~Ri?X}vK`EF})LKypE_RCn?hJIG)(y+xWsyyovw880Tp0Hz`PK#$CK##hG5%zr0oW|(oC=3rq+4cq8vP3rd zLoQeN?pq%+>CzcSE>}3bN)i@$f^{iGWhYjzuPg%+)*P^b&tZs$W(k3YF>X#hwBror1sq5YRg~Ry8H%^h?(v3Gb}&Bus!jj;Cs{&ga!-AK3L82 
z)ngwh$unepjbK13&r9v8GN+D~WmA4bx{y#1cIC})j?E^=1m1#`8!G>)D~a31j`7}I z{Ah}R@IZ}CDaa}>=O8Wz?BmY=*tS&rjB_pd_rl$!wt*4uOVjFQ@z@7g zuFRq|5o8^9%NSSqmNbZyTb&ePo%j#M7pY3eRJKf8in^ALvvOqu=f5?HkcN4P2+GkE za0poduwvD9%(C9!ERxhl!9k2E5953q+XgWjMK9NYavsEFEO;IDpvnK{Vil5x!H9J_3>=W9zW{NrzW0gFB@+SsuIMWRNAr)>ho@k+952(7h!45$03mV8|w`yB^CEQ7fb(X zo0wgmkK9ZA!OU#;jcpI01CT6dA!o0uxJluGa+R|wm?)%}2n`ua3TdI#3jkNtg=(pn z;T{fwgZL_>%lg7>=^V=(tRE{12VTLirfhi<0}nTGlaY(Y+c8Z)Ep;1$WKLC}S;S%E zdt(uP!_RTWu()Ev(u;Xp4~UD2q2PF7=i=D4YhHc}mQBIEVaGn`YRlQQWZuygyHGoD z$&!A`vUfh%AB1*D;iK5YwQ&xap>{}Na`+|F;kIq3bD*9A+bP%HJl!yDIAzo|0gqJh+tB;D+tE zvvW?t?Eu74c-Wh4wm*!~WN&uW&1u2%ys58{z{SzMZP+oFEhNz`YNWtgT7p_TG3*KL z1~wh}2=K(tpq{#qw$uaAo~k=%O}&SCQXcXJYf3dcXLHky_mnmFTG3!Vv|B;L$V8hQ z$=ekBPlBpzW3Mv5KcSp4s+1A)OVCI;1Sd`MFYT`Z7d{c25~X{GIcbseQ1ARLBx_k0F&`k zfT;0TT8}{0LHecmL&U2YrXXXGt`QZLJ!ZD0MFXn>WGda?c7St;Ir1(Nk1Eob5Vpqm zk%E-<$m~nCAs!dmXEx`oFseG=5Ew?3btKWtaVQO6%hEpX^9q=PZ0|E~`}#Tl4CoI6 zR!U#}cUY43d)2MAo6BR=b!WBf{n!JX(T*^j zidnulgE2$zaTcuFvEe~mY(-7txWuinBB3X(*eg^!XR*R5Q%YwD3>h-RBVeJY8w(kX zU$+CjJkT%{;tuLrp|u4GMW6#HmD}6fVf1XYJGm_6czRc)q}J?W>K0%ZP5Hs{X5aQ5 z&NvToRVbPw;O}xyIu@m@&RYN|Df&S=N5Mp2H04)KEMPo=$_Js5`zZaBhaHz4yWDlw z4)!YLT?EDaXI0H#=nV3oex2&e|r z`%}KoXdYuFlu&k_)LT-=%GB=61{O;K8Rdzr)EO z9FcA467is^lbmlwL&)TjGhb-LEiY17|03q`K>8nrFMkqYBu|W;RB=9GDrW9HlH;u| z1dx5Xvr(*mVPVUE&KMOrO{EypD3{e-!)ycECk3+huRwu}G36)+yAa*)dKyNip`ZNmAz(8TbH- zw25ybWF%tji7Jx*0}^Ts*_>#DYlp=oHwP95qdysQ3_4ie&nbDXO2dQw>BEElI4_Q8 z%Y%KJ5)(S+KuiAuaqLlGi0k|Bah7kfs1Lc6dudM9vrRK&y)D1^o#?2C=Rr*_C2t`( zJx_8-T!~7v11<+OzAknwFti!k1F#}c%D&|$_AASoBio*ZGVsS;a`>;FFy*^qod)pX z7}MD^ExI=x#r@(##FEwy_&%s}PIG;uOXMmJ#SC5q&HHMT-Hn-J)}@bXgAC}$-5QWD zU!bpbPBeeNidMly-aHf+^Wzu;h@S{WZl6E*v4w?0&{Pw#2JdgiTlN%<;qIZ8+#8lX zjY9KBaN~M(9!RoZvvsuSsbNxhouY|iNyKkqXQDag^!|?K=t?@PbI5ksex`uN*ER~2 zy@mX0zx!Q5D&#z?jr;hGbozON=Cjy!eTU7*tnymDUoG|0r=zq7WF;)fr$AubhCoyZ zkxqiKpb(xTz!(CJPJt_+L=(WEnwkiXbHE7PfAhImGhtRo;GV2*b2 zU!pOBtw_6RuPCY{NMBAo2yy};5($NaKmgGbD+J1%AvCQe0I()(X=Bg*AX27V`C$-D 
z72LAi&5}0=mto2j_;t;`fXNCz6I`+;C=#UH`c*;0xyl8h%0~{tmHE8tf-?Y$#gbz? zj>$CSbbcQD?p*ewCa-;U-`!8PHB{C=R=8*kfab69dsTSM`N*;h|0Q{Qe1MB8W*?JW zcwT(ULF$P;L^l~cD7E*QBupW1)5AdJkAjS=6voxp#m#@5eM{B+INx6Yg_&+dfr`rP zQ%GI(^~fOeiaa!A+RiY4SSJFGC?v%2<$J7&a!xVj$bQX|&~*M=axDVH*pkQ!ek!xn zt!1|DQB5;9ybStib8(YdPsjsOOuQ-gGgK7Sg3M(8)eZ!H__=gC!W-C?<@UlUE5gVH z>bkzYk^%d(anP+C*i@0=^YS+t+zOJskJ)s75U4Nh5ito=V1sfhu202f_Gh?( zUnV0l1Pm=i)XXucMwslS$$*y4!E|AM>n0MeJuo6fR5z1BlTyDj~roWKvsQ@h!!}k#@ z?Y~^^^k<_Xqv@`@hb!?3A3cLz`33<30fdEng9@?oOp(LY2Vdtn!hq*dB?Iw0R zely>_te|Pn{C8__Mq|e;??NzfbIMD%%y>?vI|NP~QabFk@LX6=vfuF{6*g>S4*me1 zDC#V)L1;s$3jA)xp!8X@eB^a3ti; zeMLa8)`i>Q7)!T|q>!O3W};zCdm`{H^jli@k9h7^)18Akp@FV<*_>h455B-NGK5NCI zVtujhWF92H3lA2FeBPFRD-+je=koY~RM=!Rr0La)*S@-nh8@Klq@l>iziSr`qDF9IodA zz=w!|ihvet9Q_DzB-#MGrK{M}iF_BzMk@6n7y$pDLRT$YiYx7T73^p z*gF!5{qLH5kk;a9CC$gcxpZvx$*9dGob9ks5T2uM)e}>z-u+MO@GuKK!~1LCMJ(9K z>?mPuM8}pkMhG@STX>toX_devQfw;lA$!#D2yuc`Ew;=9rrky1IjvzdlhQOcyGqot z<1Yej6C~)z>RW)8JX>-akH2y=VT>RX>scG8eoXsXT8P3biznzVUr0=UO%YC0$ z)g!HxL&O*FX|^2_u&js}=hOP7(@^FQ&+dPZ(`LB2erJo_iW@L4P^gX~r?{4vbdF&k z0URY;9i>-&|1kt~R6l1&I7lj~qODxD`aQUjCZVY*RIAE6Jntv8Q~99u{NNLq=a81T z@?t3GuPq46SOnLTnK5$AR)t&my|veGpww;U)uU(<#1^QADE+M{>L}5~3b_v`Jhr>s z#|_L9DLDj)66iFS>K?;H%NclA6c7z8gsCYIMxP@FOf+(~#wMS=K|g!T7Vbfb^JnuS z8Y58QYDDLEO(Cv%w~XI6PT#Sm5t)y}f5prl_$@i(eEp}^NYFAufV^?Y1NAPEnYMk$ zUF76vx8b?f$uq#pWOxVhK}XNO2lVwmV{tMmXv?%mb<1<N6zclV;p=yAvI4(wdj5xoc}vG)!F_!*v_i**R@um@!_JQO^E-@;Nr6 zKD@0P4-0ep3R&I?AkJ5t86%wXNF;X6`@EB%D8hf?Elr0#$aR1X4=aw%Aqbv=@EkJ# z)N?{`gll#G`_M3_B_^L-u*~|Obmcr`@VEfQz!h=%?3l2S?fjpDMkG)$wd*9-uE2?l<_0D zb#&`nQh@reVqOMST0%CZ;>V%uV|p&gi_=N? 
zrgk(jhgqMYmtuNkEk~nbu3WCE zI^3}(Jx0wUuXIpD3B%p$FLaEd6c}oRPDdffRfcBDfscR>7|UUsUSA)qJR;Z*iEQQ)v!3#f8 zzvhHA*U6$JDAIX+$Wot&@be+wi#~9lM@fnK%c~Y5NnNBk%?&*S^($j%h`F}d7KKDp z5MU>HlTu~@^KB}LYsbXD)r;sva}shta#Jl>23?+C)LF75lJKDDd6nLKOV}21G^#44 z-Z*LMQIb$JEkx}Z#)b)zsWC}PS6p#Z;-G#!L1aj85R)SpomWw+NImeKt{VFEUssE(aX6FLh=I) z0Xdd&0D@^eT%<9^|H2H0Wqz5UHxnFC5H6~&2t)OtS)H7XWCkohPEhGNJ- zpQRMU4FX*cH`;U=V1e@0f%`+=cay3QfLT+x>{&_MVR7h!v3o7()}mq$gkRW(Hpaxk zM$Qm?!FMN|f=r}wc?+fYl9H6Yyv9b3=w$379?WsW^; z&ynR1t3;XLRbN=iH3SQw768R=@KVd*p%3!VzPzLg_TYMsfs?!Nz>yVYm~FpfU4yam zu9@DL;TvAr*q70k!0Fv=CfZA0boFu;xZ`OuVhJqJGsc#*jVK)e(uVKh!U{r@5mJm7 z@-`*FlX#AM;cEbuSZ&FdeC2pLYJ`mRBL||a;ohwjSo?3>KPJRh%gEB!H>5z!`8c3> z#Ma5QYwFN_s}WmNE`ylKP^PorRQBzxknct8j5jkau8w1llr(j$J?RtgR}%L}ol=m3 zY#{kQ%6VN*mD&b3Wfhh9XQ|l~e`+Cu>|btyuq4G*TsGh%N9Yx+XY4Y5Q_NCm?Rd z;**c=+r)0|MO{HOg8-0WHX+dq?t$*8Jn2Xisxp0sD?e>sE5mI#33O%C-@&&{ zc0RrLE%O!fnb((GKr|}nYZ@!vTRz*W zCvGH^vQg7dH^$nAz^e-;U?q?ei%m|13I^j$C4O@9sH`!6+OBq`HWG2CYPvANvYfK8 zD0QWmb2U2dhw%Ty2R+uFr~_(Byhv+{5^9Q1I1Bi=spY!9LLvf(Eh3Hlt&NrN3!^#o z$4}oz0!Gljvz^r=F>UP}M!)n7;T^TtdI>Df#Yh}NJSMJ$D8e3 zVOsPb@tX$@1exak=;syT_MYDSr=hfYQV}C_HD2)Jrgyn(Gz(Uf|N6E?Ah?{7|hkfbmB1jkKLgQFQSd=Eeq}XosTP4iF(|Qpt^E8X+C{43!1j@wH*cNJ(wol`vNI``#SjVv2x^1G?P{0#F`G8-qC`!-WPp zMBf?_m3=@ficN_CHsvtgam3&Q6}}3|1WxqwGRn`E{Z)WRLr#OLPIz}NxjHsm4ed7C zh(4damKqsaV{)$E_+|xTJ_iMbFWvYAhEd%uS_#Ch4R0~-vo7+Cc!BxNOUE%Ys3-=S zI11(dHEY4lzNr>P!y~@Mq?`((u>GFJb2_`vh75?Znqrd~33Ye8Txc|pt~o$29Y+Si zl;^z+R0Hu~;zSSJ#k9oY-_BKcm~TDC zT;`&B-ip?a?!3reG8qNllwiQky*eyA!lj*;?@I^tgj)sldNodKBZ0i|D2i2To0opH?KUkwSU)I;sVRshHQWGd}WEvku z@D84ga&&Bevg?Q!7N=xBXB@vNj2r$Owi>*&Q0Salz8#)059Mz~1)f7$p?JuEH zOEufpao?565PaC%!rzZ@l5Xp0`Axp6zZ~*{9=jDhK+L%VI7^I~@y{e^R^|vt?d4C? 
zKlFl`Zu+0mr9h_FtNiVxS$pB)@=I|N@~WlBV(QJIMOnXnKSQZWU|zzJI;r6Lc8&B< zjOrmW7piT8?jY`#ZOIid6Z||<`}il0w7hfco0LDQLZb2{E}U$gdOo1K zAE6HJc16qvUq=rZEKnL)_cx|Pq%%i48Xs%DF8r*WTUKmIh8vY(r^Zm8tBllsp9Yew z&b#P%*9Gp2VxfJwQ1dV=V9v=hX%bdJSMHUaK9$Kj_)ZZYY34jVHUVSiRmShWHW4?n zNg#VXZ1h-yOOD)B6xBy0lkh0eQ%z6@0OS%&XITdw+iDco)xOgww>`Yui&StsFuEkR zgM1T_kapx`No~ibtU#S}i?GHHll1>ld!>B9Xe%u<|5DSEhR9&?D+)|p4oQ>_xXKx} zQ4+W3g3oGoBj@|KpMUnnE!6RG{@BW& z<^;;@HKavrZW%4%MxK9bi#5v=%*?Oi8}z|CQ=Z&p>4&n05TRghG&0YtUc_QHYgjBv zdbQZ@RV0sSs~#O<1dIP2?$-(rO0_Zue|0XSX~J%`bsNSa?-3x@sFGw)_q_%=)=$9- zyr+~^g(O5!%%~EahI*m~CPkeiTKx)oYN%^u-9@)&q(TaAa1Uc@3{yU18E&z^Ialn6gtKGR;&;;xwuf+|SXT&* z?J(N|4r1LazM|5wl*v+n?W?id56xe=GawdMk<$C-G@#kNF)}pF&I~MMjE$`K3~{u@ z;WSrMd}?@QNz0izOjjS#^vm|dW~tJ3@Hz0KSWKGderv7fhjRXPCWxXHGg;j|ICr1e zwY_v>Hbg?$(?+rDSDf~lv`nX&qMs!O{or?HNie;qhJY@4iNx$irt2Q8kLI*I?Sw(r zuXtngvT+L2CAHq*Mr=$7`Ii?N71`G5)NdRk&c8pc$J44Fv8YPPNyuLKl z*ab3%#WqLMXXl~9ORT_nmT^vFy^j5-$8cF;rAKSa8 z1ZE0A$?iot?rQw+L6peof9{))`R!F}++uk8?^v?rJ) z^y&U9dYU3~Ho8oXtkxDB!L~n)I1g4^SIVALuR+J}6b{yZ7{dQ0Y8wXQ7b*rl>%MaC z=d|{TG5eA?*Fg(B4t6DQUF2A;Ey< zdMD5YaeVyL7F@ZM&O+K9{td}3;voe;t@B%Lg+PriZfv>LVx~QnR%rAsEqH1HL~F1y zg7gir4`^FFbEoJDRRnnERS5RE#rTgA!D2h`C*T~~@%7;GRE#|o%~3AmR?s(xlz0oC z+PPNJv(Z&IWu%of^VR&qG?(lj6mcq12rIn0fn#k79@0X|ysR05*;3_psoS_u%ee+<-0bDC2bfexGx%aKbEGBJi*}xL`ME#9luNDJCKV_ z&D^v1GKV`hkW3#jVD86oX}GhshJBKfmi=LOHAxCQFVMF|+Qrfe1NFh7%$+)DO_a3s zsGmhHj_GLdL}F|`Y8v_Hcd zGm|#$&y`^R zlZ8@G45xAjxb}gf6ao>#td)J?tb&03Wp-~j1_@cu9)?Y}a|+hqzkt%ZHC@kt?XbRM zPJU3+*Dn(peCZDDAW|uFh`J`Zm;zzmg;`d0Wj3jErsthpw^Ile!^09Im@Yo0W;H zi{9>6aLYi~3=w{FBVshu@ou@{aONO3v`-wQ@T28X2@|ge`w{iameqo!6?nQUFh_xo z7c5%Ey6SVEjSI^(jU{DZpfZ$VS$J)Q59-zKo#Cw-_sTWQsVZEV&d}+~c>JQP*~K`3 zpBB3k^x7s>=b=RN^m$#$Z6_{N4r67G!zfeJmA*SAh8vV3g9Lkwxc7GL7CRDZVw#GC zGU3%1Ac}wbsE6mIz0lfixg_I{TZrjF9nY4WhwHjq_c8OA(duCPF1E{qZ;XXkl{auj zcTe75FN1X7J^K&6oVqn&x?3}SavH1V$KAJX?0gu?j4zuJP%QR_Y7F4P8rr}oQ$fve zV*P)XnNA$nSsl6$nTSR%qhAHHQ^PZQmuYGY&)beFw1$sXmJpIJ5d})g@I0^fgv+O? 
zGvj`SU>?U%1oxdLiB5q82<<6jd0jS`kH_57-DiY3;kC6%i=snq@cKya{1VIJCT>Z1 zf;}RhrcvMSA@dT63P8Lol?&_elUM0rDh82(Q5{W%NPoDy=FRVRAIzJw6&hjPe&XE)uB7`1FaJ6*|M} z3|D0mPSR`t_-;Hz_XoCIk6%*qRTnS1T`di9;pvLym8n^SL^?0=$*s+p%d(iy$6LSu}PcsD;wsjX2IUdaPzT$@Akg^Tt%oy&(>|-B+cr z9z?Lz;%2bB6wDyjBuH|?*Qf&YJI&HYUbh$;I3x@bl=!jHx)#{D2buYdy*n1Z_Iu^X zk}y`$nc_0BfQq>jFLi6cYIufWsI?D*G?whdG+mCY8Sq@mdD=Pk{m8BWYM zE5Xmj49p)C7GZ(>aZsCk!xJ2MqM|=qA~mfqzu|U`hm=e`Fr( zKq#)TBW^p`dM$;R2+1{LB0olB0RROtPNKq7YFN-up~qN*!^R=SCXy#V z$0a<&`|Eg>8APRagy)8dngl?J2AdtThYRPDH>ECHAuw)JDQyoCSpf&OB0-f zEJ9O4sC~$h7u4j0^LY(gU(l_LX*mYbi+2s$f2q)StYo)}ZuT9+`OgJI(m;uuP zGK(~Rnn2)D&Jly?DgmSvxC@Y3ns+Cf;9j7o+E)GjE#EY79fAo-3ScFnd2X(~9d z<1LC=4ToVzOlH3R0{@6;({FRYi(S-0#w)peU9fJ40?sZVEudw8Kiq8ZZu2(g5zIPRc@H2gUzx)r5GM;_@%K%C**$OtLq zbj-q-ml_I1S=p^41g-4ee6g76n@iHEXR`aYsulwM8nU+t|6&FcR!hRb( zYyT&3uggEs-pd?(jMz0$it0m9ic8opUpTq{3-dp%0Kx#l#>1BVe*~yZ@&5tBnBxC2XfVb9rMod^ zitsT&D4;ObcR;^BsAA(`!3ilVD`;^4hySs5LB9P>9zw!GLPAPk>)#|JB=p6w5E2&> zvJeun_{S6eDj@fFPee#a>hJc8{?F|f^;cBH;V)nNZzlRrmDt~1GsS=UzltdRV@m#I z{#RB1^a%+`$h#ZcE8Bjp-v16uihc>v+Wa-`ixw7E{>L@>oBuWRf2M!+YunknXlt-p z8f&QiW843&kL#~I=NBy|{?CXnqyG~B@_(7fS1xRA%*@94CCjMEqReI@@gMTuf8{-e z{+|4oek~zkkN-IDFX8`epZ~N+>#wT+W{>D!+Ei28Bb5S`szE|^eU2y?kjwz!6i1)yjkYBxt98ZiRKSR)~)h1_|7w3e3s1b?bC zd)nAK#haNoZ$7d+rFjoM46r#tjnDw7Sq}i#d0rr31H(4+0s83M0UUb5M-EwZ5Nk%Q zl!oL}S*)8r_<^bFs6!pTtMEf5XG!gEvuX2IKn)B5I{@@dyAP4SSw5{^7E-CVbTz64 z6U&NF8*A+UH_25xsqtk^0Ar&%&F8ld7uDkEOC5=wlGw~4(KXSY(u~GL84WYXrYX^l zL_?x0@tps9&PGDr3vk3flm1JY!!{FfWuviErjB=E#mnk+_jq#d3O_Bb4pCez%Gcd$ j+gTPxXHk~ list[tuple]: + src_file_dir = os.path.abspath(os.path.dirname(__file__)) + fixtures = [] + + launcher = PythonTransformLauncher(Web2ParquetPythonTransformConfiguration()) + input_dir = os.path.join(src_file_dir, "../test-data/input") + expected_dir = os.path.join(src_file_dir, "../test-data/expected") + transform_config = {"web2parquet_urls": 'https://thealliance.ai/', + "web2parquet_depth": 1, + "web2parquet_downloads": 1} + fixtures.append( + ( 
+ launcher, + transform_config, + input_dir, + expected_dir, + [], # optional list of column names to ignore in comparing test-generated with expected. + ) + ) + + return fixtures From 3e05f3049ea1b993049219cb0f2f5687fb5451b9 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Tue, 12 Nov 2024 21:20:20 -0500 Subject: [PATCH 04/24] identified current requirements for web2parquet module Signed-off-by: Maroun Touma --- transforms/universal/web2parquet/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 transforms/universal/web2parquet/requirements.txt diff --git a/transforms/universal/web2parquet/requirements.txt b/transforms/universal/web2parquet/requirements.txt new file mode 100644 index 000000000..0466531f0 --- /dev/null +++ b/transforms/universal/web2parquet/requirements.txt @@ -0,0 +1,2 @@ +data-prep-toolkit==0.2.2.dev2 +data_prep_connector==0.2.3.dev1 \ No newline at end of file From 5710653d86e2fc49e0c6f0676216cd2c801eac3c Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Tue, 12 Nov 2024 21:21:54 -0500 Subject: [PATCH 05/24] relaxed dependencies Signed-off-by: Maroun Touma --- transforms/universal/web2parquet/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transforms/universal/web2parquet/requirements.txt b/transforms/universal/web2parquet/requirements.txt index 0466531f0..5c989591d 100644 --- a/transforms/universal/web2parquet/requirements.txt +++ b/transforms/universal/web2parquet/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.2.dev2 -data_prep_connector==0.2.3.dev1 \ No newline at end of file +data-prep-toolkit>=0.2.2.dev2 +data_prep_connector>=0.2.3.dev0 \ No newline at end of file From 80e4ebe7dc1a425ac74b74dd6eb959400b733e45 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Wed, 13 Nov 2024 13:02:28 -0500 Subject: [PATCH 06/24] added build target Signed-off-by: Maroun Touma --- transforms/.make.modules | 3 +++ 1 file changed, 3 insertions(+) diff --git a/transforms/.make.modules 
b/transforms/.make.modules index 0dde12b53..31e9121b0 100644 --- a/transforms/.make.modules +++ b/transforms/.make.modules @@ -39,6 +39,9 @@ test-image:: image .transforms.test-image-help .defaults.test-image-pytest .tran set-versions:: +## We need to think how we want to do this going forward +build:: + build-lib-wheel: make -C $(REPOROOT)/data-processing-lib build-pkg-dist From 4dcebb60344eb3828c9d48eb4e62624a7781703d Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Wed, 13 Nov 2024 20:11:24 -0500 Subject: [PATCH 07/24] added licence block Signed-off-by: Maroun Touma --- .../universal/web2parquet/dpk_web2parquet/utils.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/transforms/universal/web2parquet/dpk_web2parquet/utils.py b/transforms/universal/web2parquet/dpk_web2parquet/utils.py index bbf9101e3..5a7fc9cb9 100644 --- a/transforms/universal/web2parquet/dpk_web2parquet/utils.py +++ b/transforms/universal/web2parquet/dpk_web2parquet/utils.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + from datetime import datetime def get_file_info(headers, url): From d2404f4b09e64f131c35a63103a425f825942a89 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Thu, 14 Nov 2024 07:37:07 -0500 Subject: [PATCH 08/24] fix filename issue Signed-off-by: Maroun Touma --- .../web2parquet/dpk_web2parquet/config.py | 6 +--- .../web2parquet/dpk_web2parquet/local.py | 1 - .../dpk_web2parquet/local_python.py | 2 +- .../dpk_web2parquet/python_runtime.py | 12 ++++++++ .../web2parquet/dpk_web2parquet/transform.py | 19 +++--------- .../web2parquet/dpk_web2parquet/utils.py | 29 ++++++++++++------ .../test-data/expected/metadata.json | 16 +++++----- .../test-data/expected/test.parquet | Bin 32039 -> 32718 bytes 8 files changed, 46 insertions(+), 39 deletions(-) diff --git a/transforms/universal/web2parquet/dpk_web2parquet/config.py b/transforms/universal/web2parquet/dpk_web2parquet/config.py index b393de3b0..16584cb57 100644 --- a/transforms/universal/web2parquet/dpk_web2parquet/config.py +++ b/transforms/universal/web2parquet/dpk_web2parquet/config.py @@ -10,12 +10,8 @@ # limitations under the License. 
################################################################################ -import time -import sys from argparse import ArgumentParser, Namespace -from typing import Any -import pyarrow as pa from data_processing.transform import TransformConfiguration from data_processing.utils import CLIArgumentProvider from data_processing.utils import get_logger @@ -58,7 +54,7 @@ def add_input_params(self, parser: ArgumentParser) -> None: help="maxumum number of downloaded URLs", ) parser.add_argument(f"--{folder_cli_param}", type=str, default=None, - help="Folder wher to store downloaded files", + help="Folder where to store downloaded files", ) parser.add_argument(f"--{urls_cli_param}", type=str, default=None, help="List of Seed URLs for the crawler", diff --git a/transforms/universal/web2parquet/dpk_web2parquet/local.py b/transforms/universal/web2parquet/dpk_web2parquet/local.py index fea14b457..cc0b8956d 100644 --- a/transforms/universal/web2parquet/dpk_web2parquet/local.py +++ b/transforms/universal/web2parquet/dpk_web2parquet/local.py @@ -10,7 +10,6 @@ # limitations under the License. 
################################################################################ -import os from dpk_web2parquet.transform import Web2Parquet diff --git a/transforms/universal/web2parquet/dpk_web2parquet/local_python.py b/transforms/universal/web2parquet/dpk_web2parquet/local_python.py index b6764015b..735f0eb02 100644 --- a/transforms/universal/web2parquet/dpk_web2parquet/local_python.py +++ b/transforms/universal/web2parquet/dpk_web2parquet/local_python.py @@ -20,7 +20,7 @@ # create parameters input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..","test-data","input")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "output")) local_conf = { "input_folder": input_folder, "output_folder": output_folder, diff --git a/transforms/universal/web2parquet/dpk_web2parquet/python_runtime.py b/transforms/universal/web2parquet/dpk_web2parquet/python_runtime.py index 535a74ca4..6b2acdfc5 100644 --- a/transforms/universal/web2parquet/dpk_web2parquet/python_runtime.py +++ b/transforms/universal/web2parquet/dpk_web2parquet/python_runtime.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + import time from data_processing.runtime.pure_python import PythonTransformLauncher diff --git a/transforms/universal/web2parquet/dpk_web2parquet/transform.py b/transforms/universal/web2parquet/dpk_web2parquet/transform.py index 012460443..5cd402fa2 100644 --- a/transforms/universal/web2parquet/dpk_web2parquet/transform.py +++ b/transforms/universal/web2parquet/dpk_web2parquet/transform.py @@ -17,7 +17,9 @@ from data_processing.data_access import DataAccessLocal from data_processing.transform import AbstractTableTransform from data_processing.utils import get_logger -from dpk_connector import crawl, shutdown +from dpk_connector import crawl +from dpk_web2parquet.utils import * + user_agent = "Mozilla/5.0 (X11; Linux i686; rv:125.0) Gecko/20100101 Firefox/125.0" @@ -62,17 +64,8 @@ def on_download(self, url: str, body: bytes, headers: dict) -> None: Callback function called when a page has been downloaded. You have access to the request URL, response body and headers. 
""" - doc={} + doc=get_file_info(url, headers) doc['url'] = url -# doc['file_size'] = int(headers.get('Content-Length', 0)) # Default to 0 if not found - doc['content_type']=headers.get('Content-Type') - try: - filename = headers.get('Content-Disposition').split('filename=')[1].strip().strip('"') - except: - url_split=url.split('/') - filename = url_split[-1] if not url.endswith('/') else url_split[-2] - filename = filename.replace('.','_')+"-"+doc['content_type'].split(';')[0].replace("/", ".") - doc['filename']=filename doc['contents'] = body logger.debug(f"url: {doc['url']}, filename: {doc['filename']}, content_type: {doc['content_type']}") @@ -99,12 +92,8 @@ def transform(self, table: pa.Table=None, file_name: str = None) -> tuple[list[p allow_mime_types=self.allow_mime_types ) # blocking call - # Shutdown all crawls - # Check with @Matsubara-san as this is preventing us from calling the transfrom method a second time. - # shutdown() end_time = time.time() -# logger.debug(f"Way After: {self.docs}") table = pa.Table.from_pylist(self.docs) metadata = { "count": len(self.docs), diff --git a/transforms/universal/web2parquet/dpk_web2parquet/utils.py b/transforms/universal/web2parquet/dpk_web2parquet/utils.py index 5a7fc9cb9..8214cc817 100644 --- a/transforms/universal/web2parquet/dpk_web2parquet/utils.py +++ b/transforms/universal/web2parquet/dpk_web2parquet/utils.py @@ -10,18 +10,29 @@ # limitations under the License. 
################################################################################ -from datetime import datetime -def get_file_info(headers, url): - # Extract file size - file_size = int(headers.get('Content-Length', 0)) # Default to 0 if not found - content_type = headers.get('Content-Type') +from urllib.parse import urlparse + +def get_file_info(url: str, headers: dict=None): + try: + file_size = int(headers['Content-Length']) + except: + file_size=0 + try: + content_type=headers.get('Content-Type') + except: + content_type='text/html' + + url_parse=urlparse(url) try: filename = headers.get('Content-Disposition').split('filename=')[1].strip().strip('"') except: - url_split=url.split('/') - filename = url_split[-1] if not url.endswith('/') else url_split[-2] - filename = filename.replace('.','_')+"-"+content_type.replace("/", ".") + filename='-'.join(url_parse.path.strip('/').split('/')) + # Prepend host name + filename=url_parse.netloc.replace('.',"_")+'_'+filename + + # append extension using content type + filename = filename+"_"+content_type.split(';')[0].replace("/", ".") + return {'filename':filename, 'content_type': content_type, 'file_size': file_size} - return filename, content_type, file_size diff --git a/transforms/universal/web2parquet/test-data/expected/metadata.json b/transforms/universal/web2parquet/test-data/expected/metadata.json index dd65c2493..a2a9db309 100644 --- a/transforms/universal/web2parquet/test-data/expected/metadata.json +++ b/transforms/universal/web2parquet/test-data/expected/metadata.json @@ -5,8 +5,8 @@ "job name": "web2parquet", "job type": "pure python", "job id": "job_id", - "start_time": "2024-11-12 20:39:44", - "end_time": "2024-11-12 20:39:45", + "start_time": "2024-11-14 07:31:14", + "end_time": "2024-11-14 07:31:14", "status": "success" }, "code": { @@ -28,18 +28,18 @@ "num_processors": 0 }, "execution_stats": { - "cpus": 13.7, + "cpus": 21.1, "gpus": 0, - "memory": 14.5, + "memory": 13.62, "object_store": 0, - "execution 
time, min": 0.016 + "execution time, min": 0.01 }, "job_output_stats": { "source_files": 1, "source_size": 485, "result_files": 1, - "result_size": 32039, - "processing_time": 0.94, + "result_size": 32718, + "processing_time": 0.617, "count": 1, "requested_seeds": 1, "requested_depth": 1, @@ -52,7 +52,7 @@ "type": "path" }, "target": { - "name": "/Users/touma/data-prep-kit/transforms/universal/web2parquet/output", + "name": "/Users/touma/data-prep-kit/transforms/universal/web2parquet/test-data/output", "type": "path" } } \ No newline at end of file diff --git a/transforms/universal/web2parquet/test-data/expected/test.parquet b/transforms/universal/web2parquet/test-data/expected/test.parquet index 46dcd98151b8f8b7a1e17974a14569e978aaab0d..49a48ae57f6c8dcbc30aac6f1d8190b7a24d2348 100644 GIT binary patch literal 32718 zcmbTcbyQnn*C&d*yAvR|L(l@n2^3nO6!+i|+zQ2Ai+fw#-K9W*mQdU+xEG3Rp}X_mn2tLCr2SiB0)mpXyJbZ(Emb0qDDeO z^0c$IaB{M@aE4f$S=gJId0P8=^4WPpoyd{MNyvdbh~E&AV~}Iei6U`O|HqrD{__%o z#{VFQt%`(%f;d4zMn*wmt3;tA$Dr90dvSL4^21LPDI2ldJt73ZUCWLZbSo0EEE&{{(&kfY@2Mdsur)dwJUM zivKtAQ~jrVss7{2e=mJ0#(znV2XS%sw08D1^L+36KQ(ZNOp%Aec810li}p_k|MA)X zA5{Na4G@tt{~sE_l0$I!zyAJjX{oUO-}Ql`)}-fh)qY1_P^|l{m;H; z9`=6!Q@nO$iWOwGQ7pDZEV}=C_Zwcc=g1$`{BK87OR7cx=tM{|xz`p7@_h z{kJl&4A7IOt)Twza_67aqayuvM@b9}7sgYPeU-W=yeWh?;My2Z4LdWaSzos4UNJCi zyzz15waPr6;;15iX1w2iR(Pg5?{zV`Suj)Mv{vI(&D~iJ&|6Sb5&O8$!0Q(;90r`AlGXNu+qmE+HaA4R9sM`5^Jw05x!2`uCZ zTnWWEITO`ZN)?J|1KF6N$X~o1c_arnx5Q=|{-CP0g5rAG&343qlUHadSp4x3IJdzS z7Nfosi*j;c$JbNx?`EruS=eEod-6Z6Nt8GexJpXF$nh=mLfXIb3RTLeE7;IbL{b$P z3nLsTz^{oofczY@$f3=7!Fq!$^rsZj<<$J zsl`2&DxY#jafr$B2Nqu>?(}gNer3C~=ZeNyulCAGwAG#dnxA~gLN%dlb(AYI(D#7` zuPG&dysUyoQ9zU`SVH*bNd9#a+fRFKC4dHc!e?wbR`7u_o2sW=0jCyG-x;g)w84Bh zjY^ONgY{>TPn99&`pu0|&V9p@jo&TUXT-@sCFO2A)4}9tp{gj!g$hyFL_s>P5)1)t z%y@LT*3L?4a)Xi=o*AV%R$*9!Vn^5c^M(n=xH;Nr$T4)7mjZ8805qC&LK2Fp5kISm z`doSZ-cdBM;4@RiBD0}2v76$z}grI{TZ(3oRN!6HiOd8NXc zN|kC!lQ=1yHiZ-6PgWAx9e>E&+r?JIwE 
zEc~)0Up$1_Sxd1sq7j||d@G{LVW+FTH56N+RWj=YNG283D$e;xs+2CHNtXJ785czF zpDvbo_8Oxw1 z-Bid|ZkLS*yta@ez!qsN1g6OtD{`rfM~|y1bCuXMfnEcD{UCccoh-XoOjO=M)VFBP z)Mve_do3$K<$o~DAo*b)=Zlw$?tHQcH)JfSL1>6dRA;eX+WNS`XL#+`z)r!Df{%bf z@YN8i{t!%VPKp8`eSt(K!u~6Uq6A;_r)?|)L7^HmHBw0NailO4r${AMaR|+naJDiF ztw%PHcrLh!GB5OdpKb7)fp+ z9xR35EK8XdpYf9;>|A&tL}V7#PhJGh1J-3i?&2R$2NJM2_F&nM1B+C75cw^CJZq-lxFn5nVL4 zQIJNtt2knrgI%aDEVscQ^*#7K*^RrV`)iUks~85vH5$!XosnVyg+p5QJ2DL}Ir*|} zI}VPNa*1C9C_izO9dv%hGn1B-+Y7l%`U$>E3lGWRR2r|&Qb!A07c^gNFlUGm*ABzN zKpnewWh)`(AW@&T7sook5%Nto=A&{2@94il_uE6w}UpN`m=*!cg~{1h%!a;HHde% zbMfpOYN}^74t7mMu&ORdmqoizU=T~6uES10*_w&bx|>UNDWy=2RTU(M_RaMhGA11n zQeB(~_b(nyx__r=`GhT5KV?(wrtvAX9rcM3Z1v68vys@zRL+FRGUMGl`epQcF^#FJ zvB}mrWRtL_E9q)B%QEjH1KsI{^&t!x=(FP5DL|!?adyMKFiu1FF?Pe!uo$v;Hmq)- zx9Cc^vx)?h<N%{g^a1ciB8m(Ng&K&o&yd_D(qe=WZTxMyE|vplS+G)v_R; zNGCNV-HKvYoz0I5Od`!(jz$jp@=F{DTFBc8&$jv=!;DArn(SR%$W9@RO3IFs2;+sk zk}4Y><=h9bgJ6;|Ics($RwvT0WK7LrH=0;nVuTyGCkuFJCC+GS5~by(8vpYTgqx_T zW?+RbJPBsQU_ZO>rQ7K~-r!z9_y(!580%S8-3KX1Aeu^mv3P{x)Hy;a-pMst*4pc8 zlBy2td(i#VpB0J8gMql2tpTba)j^{8^Ufqv#nPvlsL1A#EX;KC=|B&%&D(xANLFFWpv;@+>9QAR>iJL#|AM*qN*}0k+8Dhaa3}}6v?55AzvaS)Jas7VHMynT-y%9x9@nsA%%{v;UZ>}2%?PWNzRWu?fpEv5Kgdc#o?fByS zZ00yX`^7nlUgvgPYW;G&rI(ZT@(I=Xb`^G67{u}aE5yu^2K5=E?}LRsv>2l&{9YcS zPGl9yOfo#kmlJ-UC%g`Ee!2>|dAAQD3ljYi8|1-2vxa>sZ^8k)1qz-vM5K$ef z$ur&OD)1>TR$DnDJcH~V9;SK`If(;NgDx^)o<=qdD;mKk78kf2fD(>F%w?w-4A_)M zRfxbMVdKEZ5daBT<(r41#bDxLlA~MLu+sm^eis(qf}t7Qf-wo$3XP#7nn@NQj=-|a z!c4p1y_j+faX|!{6ki^=k^>u4xpt|bK-XHZU@%;L+TK|G;ysW|_-8~PsH!qshZyMI zoa_n=!%lv)ounwPK8;2OI77nOvP57HxA`BaOnNL#`8H*-Ie-g$#O6Sb3jXfus-^KZ zmd&0O%rVPBCX!wq4Jgkpj2e%OWloo2jwmcmVOBo;09Lat*5+_1R#B!1$hSe&;GA}d z1*-OoCwwosL|K&>8``nVA%9!3q|;=;tVRt!^enbhqj{(HZVVznQ$U8nmye81BKZRJ z2ExRcfXL<=tYt$?^z4RdIqAlQILch8ssZ_>)CO!46%xNt|AD5NLdv9}nEW9j?oX++ z?~%zwQbX(nD8ZtMECnRMW)#UU;Lzw_IvR5Tlu*t|IbR~ia5G|Ya~cdyIsl+CR90%L zaV8uLmLW3Gz$8&h`lQZAM1;ptQOW$bop*wbg+4KAK#w(suNaj{^&+ShHWVcW3(K@_ 
zxXovIJt*M=v)Vf!@oT?>{23C!5Q;Zm*}o-uFL>(Op7wa3ZNQ+g^wEv5gH2J5C`ZVZO%1$WFA8~z2<^ck;LZh{3 z_h&jp$b@a0hwyP!;KdF!-YjoZL$cW6FWPrJMh@JvIiN(yefi?me_>LV5JDJ((K6)^ zW>#39tR`6!P$-5jGAyn#yU?bQ$e{9%rZTI@jJzP7W@BR((LkSZ{?AeC*VBk}pC+d@ zBC4=3K7~vx4-1pioYfwqE-3ZT(t7k&B#p+2Wx&IH+gYC0=ZYkfWE0E5l|x~zqj_a6 zo?k39v-vGM zRa-V;24RXmf>IVr2Z_cCn)0bA@DEU`NM@;}5CJp!j5-wommOl4<0#5uwin99i46M# z#tKDVC`u8a)4A*ivcwHL$RHrK#6Mg;DRB!Slk4nQELFh8UO^5sIe*Ig?M&C zbh3b7<(E3i7{udml|Z2pSmv4>!2-b;NLaEMqD03ng9$xZowwKXEpy&|kETxc-nfCE zjEJ9g*CHQ1C)Z#jnMT>I5hoA&aCyDb?v=Q>WqACYQo_haMC)V#V_oYc)(5L9;c^%&+*ai`@Vp$i(3!lDC;+Kwa_nmT|! z1ny`uO{hOy2h6vQbqnn-6zJXcye);Ua?f~Q2k34!zM8B}{DLxoT$HS)hLF=)L@eTY zI8BgFUy+clsX`N@I5gBeUIfi5cAou3wmJYnrI~c8&ruxuFNf~rB;$l;lWA@%f>DGy z^KC-qlb=$6imVbEIeOQ!&AUm%s3OBz{58O21ukLMsR%U>?NQd8J+gOc z%IVs^06F&LZEyltO}?t!RAI`2JmY6hKMphpPvy!uOyX%_CBhP#SOOjY%&KZMzf4$oK?)x*7EDAp#$GewaSCj_ARmbWREt-|3m zA{jUoBN8XJQqne<;i6!^xKMRkRVj4`bdf#H!fS(S^IbMMI&5Z$JruECE4W?yN+oX8 z{X4ZDf1+7jkddIwz#oR`Y+^JCxxsy3!ke}2h94h8su(XaKM1&VEKm)a=KV^#Ox9G= z7(+y!42d=dKh~@|X>7ho!C(cD7*(xNk?<3nuo8h8Ws1;5sCbryNDB6-Dzzri9ciEs zM9m&Msw7D=0|Z#|nm8ii@3C;(DQV!N5ywXvA`mB!&5z-ad%F#HAO1Fpu#jd#7<%tT zjOe>0##NBH;tT}o)Tb0a{YrX$T@Kky2^SbcdtZ7WI98cMAd;PAs)G+qR-}0+Nk7)C z`OjvNkn@>}oZ%KPCP7~g>t4R%z1^fz1End%COtcZDSvAjQc5N*C7WN0sJh;Xq*`So zN+O+0CQghLF7BTP18F<*f=QlNp@oI^$%Mr5_!tKz7IOKKmdS5RXnBC`CnDk$y$GH< zc75>U-T1+3%T+m6O1$ibR00#Ar8?Xg|h^>8X*Vu?bh7~5%*ExsTYdW(SI3B{o_nEx1+FVbom(hpZmL@i9_W>RIU zz@z4}19Mo}5jYX%&{I)sAhL;Jm$`Iwekj5ySxKs|vr0YLl;oOFY~lnVL^_jMn)!C8 z?a3SzKMY5K)L>o8jPI4jX!tg|^~_*U0kRx=EIF;NHj1?kx={b9J$tGuq((uZCK||& zh3KtVOzA%e$H~&*q@-Yr+ZMPH_lfwZNz&N7Cb=9DWXDAjnaR^c4PoZN%Aue!-s~Tk zOamg^<=9H7z?E!VK8$nad4IkhIwuuii4cB=C^l_m=GnPFNQTH(V)Avg(TNt+UXQ*iCrQ?Vhb zgJD=)cA5fmVJSSsXdjE^2ef<%US%Adg8{LW1?Hj3kvIv;AzKoO1@^jWdC3m3`3Rgq zE&c(-4FW{qP5y!bqCT<}E0#}2lX2u<=3m|pS<537nLhx45F#KhU%PBUuslR7Sl~cE zdLka^o)J+>rj_=gkd#TA1f7%vL_=2C^xU3Crc}n19MX+RkFGR9$~(>?#~g;nplQd! 
zBl-JF%ZidS3oAVxp}&;N?*?)4)}rV0SGkFw#A#CP*%E9CDRZ;2MF>G99P+R^M_rAb zf96a2=*b2GmJ=0h*6&wKjHpPy-9gmsY(JQ}9YpA4r(7+C2YG;++rO=Zk^zDRf}HvK zh4UvO;j;KTi1ry8c8pCh7m7WblpTg07*sl=Sc*9ws@>|h1zB79zPEH7UUIn_50=JItYA?MFVcKtLNXL4~8 zDTqhW_e?yGh*J)Mz^2J~5!c8*AoD7ki+&d72BTnLlTdPj?W8vPwtI|@l(jVn80s-e zg%CiHCIOew(sMpbzA?^NMRX?Zvkx&a$CZix%@tA@8g@cNC|>e|i+A!-ISo#Pd;tIj zhYiFRtyy4!9L8K=5wcbH*kM!RV2{WQ0uh|YS~5)9C?UlxeaO^6JK11PJZj0ZB_-zp zIT8cA@yZx=Ab7I8{y@`>3FR$X8agSEj)co{0Cpx#sE5vSt0d`ZQ1-RU{mQqL#a*ZUXdWTEC)b} z!n_GW1hA8$3WxEF)^hw={D}w=BtcO7c#IYbTT~dXX z5}%z^YQ(>s2Vx%iz9CL3AI6O0IQfu`x|cMEQmn!dET=V~KGg_Dk$Pvv%Z4gKG?qd{ zhNvjM$8bc)hU5crbH=UyDJyBn-Z+bVB_eiEu9PFg#(hXlJ{Xa|35AtXF~SEGHXCSr zz^pN!9a2We*<%H;I4FMo+iN~YI>-`9QHWnTo_4X08pE^L`bnPcWKk_ zAE+^T$}D;i6tZK9z`eEbByp$aYGG;vkx5YIpbm0BvoDYAe7-&3i!U7VB9c16U|$~B zjd^Adk%E#8M7l^iM^_3BpLPufy7#4ptmlRqAhW~Jn2FIzv55`@LWb01a4Ge(kebb7 zk9Uh9ZgG}HW+`QN?o6oENgNs?; z65Tpmb5p+@v0dpSvYP_Bwgb6humONI3lJ)|B`b$A)z|c|E0a`@)b~B!1PtUFC~kKR zVciH0Zi<$aCDj3hgl-@c>z~D|1Ef zT^>`cLZ&hO8Yk!DPw~`ZmH|v+oqu~*xH38k6(1NRsp$GGqDd@VliOnNbz>ogRg;3S zDhPn?Lk|d+jq(|bcPnD5;NZwXk6O%8m`|*x0S^c?xdBy$=Ku1AD!yisM+Zml>idq5 zYN+tdMzC;m98RhyqrEEZ)0AClNjy#oDO8f5M#0Km8gRDZQ7z;Qjr<;pewHnl4`8A# zcLUN;@kjwg9O$U!m~BM>np*YP6m;SOLBiijdAmUA9||jz ziB!V+-9QL*B=WC`kq{kfDNh2V`3O!SG4BeV^ny9>D3g(q`ZY&A+0|H!aDw)VTAHfa zA_hb35EYwijfu`O68Y?Ftrm4$npb)hdto4=rr(6JUka3UHD4&;5CR20+bzMS5V<+K z^iS4XG(kFX4{Z#ix#Ij%2M$=?0XndR-LToAd=Lc(ESZRC=!!6uN#$RC_uHUJP2mn;HFo4Q#tfhZOQ3!S8uWFI|C%0>jBfXNF4Q}@)So=qmf!cj6*1%K0& zyY07`q^99j8h@+H%hW$w$%!F8CFG<(2kx9_0oWkNNJ18I*VI|EwSqh4U!dh^R#h#sYZiwCRnqTlL~V>g4*Hl(MnV){3)r{3bC{2&g*#j*C} zI-Jww|Z+f03Qv(9CFj!$=OBaSn39sOUn{;eH*|J*RW0iUm({No4l z_JWVoY`cWj5IKA58XHH|Fs={U?mj^mX_nxN`sue{1zTGo4~Z=SD+{#=>#Z;~r?yj* z++JuhdE+U3ux9LNQl!5v(a7!N?GktV*Rebg922vFfC(H=y#qgkCeffDW^n&*#1C4b zzvmcT#PpU$5ve~>Z@JYOeoyY+yXLbn-_tFxJJfw*95T9mR^u^$vNog-=RW2H%aYF< z7CZ#pERRFmUc4~f2X&g=RoBlS9y=9r`+Kxk8x-~KdbzBR=!Yd|8iaKV^mMiJ^#sD& z7+*@l6DN*aw+Dvix4BP6p04kf$65s!7J7M~;(B-jVb*6YE$zoGG9Tb;j$LbkHD}k2 
zx5KdZsg}TpfZKbX+O|Lf2z+mTzWaB#%phFfQMi4oyZ`YxvQy94^9XRH&^FQS*9-k9G6+4nmS`Po5OO>^Nndd3ZMZnHKA96-xvd`cweSys zWqfIoXshWaSv`H>=;YefE#vVpc6dzj$G8LyJ50QNXn8*XEvMgh@7)?7$rlZz{nKI+H}Xbe5E__} z_@`&ayDg!+AOE7Kl&>dXswI0f@U2ZZZ|C5;^Kdm(MeoUs-x22-8eW^&5K(uwBy%wQ zRvbO_>$RivCQjV^Op+l10{Cuou6MZT0HDM zY3bQ>=}y0eQn!lUE-LFrx=6J8!4^di;~qxSUv{?EeRx*(4zOLE=M-ySsU5WpKf01= zU9VkTz6?n`d<5_*(&5{_1@UZWyy%;%6c8= z!`E}8mrspB*Uu3(>GUfHb+sJzVspI3{B--!H)LtNcX-a!J(({t)J}-DR{$1Nob9M7Hnt^2x6% zY(=zD=5E>BF@9Xs-SZCDU|pqsW%=*O{idj6{ApW@5A(`x8Cf1rq{y`4QlM zR5RfPgVOwY^nP&wWpV6neZ$hkAOEo-pjryEs25s2(Rc7LoBb$r+8AHRP*o1=V+eW-aHS+F-cZE1$SS$&%ES$@dk|2FS~ zL)ofF_jecmJn#Rfv*j$XsV8B5#SsYsb6(J=sF9ecN1=rQW0C4E_%Qi;Z#rB3x?65r zuHiIoLA<^94_|%fpvLN(Plka(kN0)!X0B#+?cIK#7N(C}S`svZvNPc>_ah_FqaSS@ zosXripN{yQcA+&rU%y4XHrc-RJeXVC{TdK|T_Zf-Iwj&9e_!Pc=XGWIdQ7>3ncntz zVXg5VQnxk~XLNe~&_idoaCijm2>92u_4=Bzz>B;Bt6F&Ud+_amcCQKX4?6yW_N>V8 zx7B8z8o7R1aH&Igc64z%?AmHvFg5+UGEx^-)u!atz9yWD#@V#qz{!^`)bKKx{Y4_h* zs7FNio#V>T9q$WhSNl9p?tP#Y6gT&x({}f`>~G+V>EhQ7d0o>%-(En3!HXRz8?b*eBAQ#-A6t>Ja0rLSu}r{s_4 z=KK4j_o}fiFnjyQL$`15`xcY#-!(scl)hHL15Ke_75rgU3mrF-X0PgAR`Fk=w<$sQ z?_19=MqY0=&aCYnW{B(t%&#rtuIQSTxf%GIZ9BN|yX3aDTHj*G>X0tSCVRx1XHE7p z6LIm2ef~x(jgt9ref#O{LB1QcRRX+@{qGF~dxP6}VY#(?zAgS5t-YoOQtiyt)#G)F zeQXlyQ%Bfy6WG=X9c6E~=0`E}XSv#&9~z9;`4>6pXaI6D#jt2FE9<|B8mBvRN$4I( zl_C-qX3sEu@GU;z`5O9RxcT-6v<$kQX|#v4a2|`B*`B<%Nba(FZbWp8%Ih}^2th{G za57g14UMnGjnhN6LIjC7g(30-sBsfCKmq;y82otbSZyk536I8a4Ag!QbtwV16+s(y zJ73qEQaKb<2c(8Cpica+$6<0tG5V4ry?B<&w*J1RFQeunq^awAzOlF zSpQU}g_s>7xXzOh{P);U6bHJ0--Z8pra=fc`5glv&ocZ31U5%1_@~gp?*kb4grgny zN#DLvBUT5$d-8F`3Z5gRv$On1=id+t3nM;;(?lme26GS%5{e8#aR7`dA}cA)^k?ON zLp(jS5c3~x!2*j=G`y_yZ^VR7u5{Q}leoU-s58OASggr6A^0?0n#hD-c@?BSkUQv(TCs_m3L7B^5&LOWv4(&Qi zJq!y=(I5I!XvOv;uT|}~OmCC*U*@j0C^{p4&%cZOFPv+R#h%s|?*u*9?x=)}BmdU$ zhF70ID#K_#9Yf9=p8St2xlgX|A&;#LM%_W#kci2S^&s8F3<&qhjl_`${b-Ft}x93wZnZI8pc4_AA>7LEZEIru)!7gzl%40dxrK@^rDFmE9d=tM-2W?HF&v~*Dkx#?UP3Eb)khrNg>apm0{T4 
zit_yO=!j`}H6#4#Qp_r0^hGDs8ND;;B%@^>_Ti5atrM<==|Ey8)0d4Qg>+^1%E;m z)qw}y>+8<+z0-z{kgrJr{$D?Sx^CT1MCkv7LfL0RtkdW3$V$`0F2)x)>}kEO>0_;! z^6kOHidft49=DbG4fw~kjmCo&H+s)w40<@$YDZh5S+4)_?ei+Ppiuj&c+YOoySpX# zE9drS-P#5=c{Xw*R#f{nt%6lWK^SRHu zfn0g$ra=Axo>k@XG%pwMl+j)F@&I=|JzGrF5z>Bc0)Y(Ph;Xk#jhb2yMskkAoLHK; z1n!l60tkDTl*`U~z4mBwx2-kSA7nftP7rtr&C&c5xIU%WZLl++u_?`^B$eCl~3 zXj$*I9=+!~d7-xt%{2&FnOGf|U*xx{KK~IApN*U^7X|)Pw3i~ z(CjmeHkZV}~g zjc-|p!|DW`g{vIfPO3eB?*@7$*Swq`oLKa>+RcSR>dyPD=V>Hr74-Ix+Q}Ou#;;e~ z+_+ByHoHV`;mlQ#wt5pICpbe@|1mVa<|K*Nqx$Z2;Iyr}q46PLh=#iXF5?v7y4`dG zIWn(mvYxmf8~KgX_KDniqIXTdpv&t7C;L`HH_9a&!YbLes^8o3=wrP|A#>COe+tr- z>Ipbmp3M9*w^Y~WC`F>!5gkS+6fF8JsvEp zVD0#goi?;f*XM8wgJ1LWt8?A_+q$+V-Ck>lf0f#%j4UFL?yj$uovruaf604V6I)uH zACkyV+Sc^fJszHy`D#7pgIxcZed-7p9dMnFkJ0;^o}^WZuE|*?z`4qu7T(|uN~DStjp}(FT!@`2VE_k0><5X z$%CL0t;(>K#wr-JevW>?tCryXGkM|(T%(uiZ)?pj6|<-s{J{u~_fG+fIA)IU*NDxl z?ph)WUfUwQaFlm(cHE`w3@w_u4YY9d+S_=VnKyi1cTp&5ncoxSi^F+Xc)WhWr!ar8 z=-#W0wUA4`-<@Dq&}2PGvm_b_J;@ZDNA%7*-EZNGI4W&t9exUJlXE|;_m?MDd-QDF zzI00-nI*=GweocKxU}(ddARu3($PA{y(&+`-~I0KUQt;q1)yY+_Et^8O9#D(G`e1# zM>Kvk!lSg5kPHb0y|lO(DM4R=1_RYX06>I=DrX@;g@=QJWR66Ihk+gqNfp;lB}J0A zJThxI8F4z9{aYsmw5m3F{ww%0+&VP#y2!&c(=7GH%uieXkAGxzpN2g`bU=Gzd!JI+ z%l=6Ao77R{1;sVp*Adek{?(g8ckp@v{pptWNO5VA9N#$Z=>*g{Cr-@SY@yvMla1oM ztrY=$4PGijj{fCuFEiV9(YK)(6tp&NfgPtqdpYw_=5HR7TErMY9WCVwr1P(pyhiCTDw{2M!-F>Ng_!c%e5pS2)GeA_+m#>0@QN9I~5CqLU z83j>)3O7K7Of&o+yov6^hzKvT^SE=R(fR1fL$8Q2jrahLuEBb4dG}cp*r~&P!@Iu@ z_&tGw6UUese5X^ruM{>c@Qr~_Cx%{;KCYq;wRknw7iz>RC_$L3I(m=AXb+=2>qWMN zC+uII0^fpj{1}RR<6fGj)k?g#u#ax^sw%OPbUl_@6Hm^ObAo;1KgB@jB!OiGJjyh^ z%5lHM{ESg=ZWop{ITI_9-M=<#bW>xf&BEZ(k?ELDz)*>4eCz~sy4-w96CZp^3{r{W zYOVt%oQ%FEpdAPah}=e@MfUN{=XI^&bEFuO{<gocKVqf7hC-8FkJcpgOvI6P3qrBP4EIkO-CCC0)~n?@|L%ZL*>AhU_)evoazFhUgw|=$ z^E9%IoO2AD>MS%LV9JRIBye&$m;~YumTDr z!w`oXG~GWa`AeDrkWs1tV|>F}tW)34G1%WK?0seM&NnOzKIvPsM&2BzLc8qGANz$a zhN9U&dDyvij@sc>`Ocb;D!4`G>X5%yYXKjqDzoUyZAp2rT;@f|dif{uw~p&WX001> 
z(Nr2+E5=ZpQ&x%PT3JvlBp?_HtQy2pD}wD9;q=0q<^A5d*52<3?wTc0R=b7nC8xEv z?V^u6x8-t{cZPFWP#960=#+MW1ekRA7@nVpEm2-}Zj!D07Sg)dZ@=4bQGV(|Du0#o zQV4C08$_#zlPN+o{E7O0dE({6ha+Rt^jZCBd%6ej3>XbAqWeeV%6mUyEd@%h*R06v zCfQ*Z|LM?7`+T9kJ&7t>K81_x$xTSfwFtC#uh6m`voa|m{^K#Zm_C@Wr%gLR=JE}!GevIMwv!b=#oXT=sWFkJayEA>};5F29{w-@l@UE5t(dW zB3D7>l*!A{aGv5R!dY+=sI2Uqv*~uKytLH%{l(x+IaQ*C{LBgHvrg)1xt53h?=1>D zp7ZfTybH7N)%c&DChhiD2k~j2N3+Ua6uJ$f&R5VZz1WS{ED#`aXiiSn@IFYLvl{$<@ zAqd7Tx-e>oM}L@&HbLyy(=9|L)rb-X%cbN{Hmb7yIY(EGhI~GfA24JnJDh_b9U5lP-VKwJlH^Pv;%AD*XWqYnNCc(=w^kD z`rt?(^de(Z&Hh&$J@nRX+tFNV1Z_;}ik3YaWriCiYh%TSVg_$VLE3unejVwEbH?~< zy?e+yiZ||p>n_0?GYR)7+1^k_naE_5OA<(-cTv-6N@mG^IbF_XICIh#F$kFGCeS$E zBQt|}Euey|tu1jdA?q*i4zlC518)RW>za4H3e#~6gd3GYYJY33`USPFpSa_{e;%0U zyZehVBu}C*olYblI%L48Gm_tf@)f4;6uGiX=?1%9Qr5!vnP)Rz^@>XD#jVwoM>(!f zbVRx0BO@sKV|OMF;L$2q?aLnDaUTab!IsbViTATkM^e^$NOLZfB3i zOoRNAbfwk-0AvIBtr61WBg=IksqWz9z^beCOw9XJmdP2)tI1yta zn9@y)2ipARl}AbLh-9hb(}*!`mtpdC-YXTPgT-cP-pXz5V{$d|*?iIRcjDENNDY9+ z7H^!gOIBBB6O1xs`?jPAR-d%_XUMC!0Q`b9m9ljC)RFSR$cBl_^TOFLiPM+zau#Q} zAV-O!s}(m9oikWhd{q6f-g?aXJEl6J4KtKxC#hh9UF(DG>Q~Z8_s(1}Xgu)zF@Ub8 z$!6PMukM7}*x1PpWdm-e`0X7VBX;qg?w4XOoAYm_LG*vDB|Qywbh=&lA7UXWb*+@WERT1>X^C%IeT}d zrB8nkTUi)m=FN^bDw z6?uZ*mF8=+YE`0SR?HD}MJ2l=+MM7QyG16mbZ-Fn2DrhU?Pa+xqwkgMO`oXv!%bs2 zU&FY>t9U8x7Y(J~o$(B_RxRS*#QZSI;rA$T)NvZk%1wFwv5jNH{kX%;`$W|z!=t2x zrxrLu371k>GNI+~ed&JpX_iAOQC;(1M>r_6t|f3h6`18HBax&Dofl z3dNk1t@6T`PA?v{$&$!KEXGI{CTgQm>Y_vTgCuum;xnsuXkEI+2%Wk5FRCtgFoc&G zQmFb7oFtvLl=0eaFg-`_tDT~RiORZYx&2R6BR%Ay!SVLp&tY!QG1v6m3)#P93Vg{O z*c-Vev^$6-=+%v8jG|*=boJHof7u}O{wz78okV)Gm{EQ&M_@Y;*HGavm1pcS+m3Mr z4MacU%)eqDR%j~I2UoSPYBSu~wq9>(KN-jH_o?2#FhLGt3qMgpagSv89JNR`I1S$g zQW7_P@~-k$={hi25FV9U)AVmqa{IBKA4iI`Fi3vSK(Ba*;oLrpIWmN?Lz72q!G{u9 ztI}foi&P=pT@UZA<`#_I;_X-H9@BKoOC|~xlgX=K=~}7L_^T8zNokJAsy95kpNl(L zV3jpV4_|OiVxJQr=ui(|UWs(9PP8H^#tOeGm*H?LJv8dPH^7}KBxYUMU1=Ol<((@> zoBT@24ml|niqZq^vy#s2X#{0hR}bhdS@@e4eLW*v{Pg{B(Yy09ePPt0at$iey_jsj 
z`j^be{Tg#$a%Sq7`BJ1ql$Xy@<&%x2^P{vnj@{Jgf+VCk(k?p4&P#}F$(YXr!@s4X z+59@_Jv7BFI{t1(kiec<%8$p%gYEyvHdvealio|wl6FPq(&VYMke!!#4woT6>@<5{ zWQQDRGuFSbR?9P#sRqhyKFDDSiUAk^-SaZD@M*a;nPP5{H9-;GPU}ttE6Nmc0{OgkF+#w4#ukfp>ea{b@B%iy`dZxWbQ;QczK;ATiaE-#NuA--VXcv1^QV)JP+Eh)OSqkOk@TzX~j$ ziPMem&;O1Ne}hdp8t1ygl)@}){a``d6GlXlS6eY76zq2WGu7%Rp}na%50LAf?$6ZR zexhwbOtLpZkbyj?uP5bIR=z^T$1i9p@U@y~6#_~{MToGFbKTqBoqM$x)@89EG|WYh0A-_Guw`M2lH>Rgv4S6L;M zY)QT@>w#Wo;tEiJ-&8yCGhYVYolGR-+zw_3lqJxDM3U`_RLXluYlP6Elblc#in(;~ zX)2Ib$jrwWnsy7nua;2h&c68?@#*Y(LQ50Z(o?p2)F6-Dw!4(qsSx8A$oX4ejdU>P z&Gn-ab-;ov*@UB79~#TQv|$D;+M8?NQ|Mx{TIoGdTJ(z&<%n<0JdxQ))hX{0;`O$# z6j!E{Kcl4GvXR`s_=%y!JO=p5NfK{ASsUnTsC9xSL;g@Mx#1t7@ z+L8b!o1Bz;v-!5xsz0f23n>%R#@Z8-3Ij#LofaZHDnh4YwkV|2FEsQjhC3KQTZ?FG zh>LgH>nY=kW<@8lX{%^fKA8O|y=S(ItC%m%=}wDEy!@22esU#K99I!@45Mo|cg8k^ zfg|3TzDw5xANDppcnj=qLOxeY8Ndb-0;CWfpUC*#L zqG*Pc_Q8O5eg$&od=DpNZa{@cI6u<$p5)jb4e{=Bg~Lf$kN zboywSxPf_O-jl47d@up*JVgPG;y^6bha(6ysv3p(8w#weTpYG%v_51%>QL3^Q5oc$ zK(8C)Z_gpUezFzPH7XyyV*K_nY5FmJa6xnm820eKu(xe6L+}R*b<2zv`#y@mZAa$U z9V#~Pv#D=XKZ4vF`wSJQcXeM*?03A~(89TiU*eHUAcpt7%Q9yXQ5jaKm|G}21{Z8I z!&!VMoXn%qBQE{svKqx93xIPzJ^k0YeL3BwpW5>Fyeub3c5eM&?=7KE@%W2=hY?M- z9R5xcGPeEx39?ZLq3ccDirc*iyzMy-*J%!Z%ng2rSdKz9Yva^r$sID}2@WxDu$6Zr zttCR}eI5h8K}+5}P6bYKJ zuV&oajdK7Dka@z;QOhbFgC59fLbN}+7wdE)JCF^8gvU|>Y011!G`1{xY2_B469rl0 zEPnIj;oCZqrnr~j^$wO`^sQxm{-5J|lQvu;kg3p}SwcXmnsCt17@fIrg?lJPEibHo z9ytdOxgEXOI@2GIHr7Exq1Bm3T0^IqIK-8lEZya}Zvxj=1{qf>?B6o~xEyBeJH9~a zD4CW_9UXs8UGnD#b~RO8u?s*~yI%{W`XBv z4G;fR`GZAr9KxlhJRkshmjqpK7>?eJ_hhrrA-aJq^HkJ9ix_j3_q3d-!SDDwOmR?< zfXd$DAshmqrhB*~FAk;P-EU_&JOEJ4| z_GfJDQ_6_ZUGQ4{INf=o!V;VW-o1A6V@A4lKx>r3A3GaF6+18^ZQrkp-uElkj0nGR zw+{?qQp284@f(g^hn860-^NSyg=WIE06*CIrHp^nSZkcIost-2Z?9Mwqh(|AJBFxC z+q_F2=0h3TpZM$PF{dqgfb>p;vcBTocNz{}-CX+fhd}GqDaPAw*LQ!18tbzx zvyE$5bzecPq2j35Iz#IQ);A1UGlF4TDdC3-1Rp=ek&jCZ6%-I}V+d=CfYI7aQ)$l5 z9?o%7`|;gK$pe6BGjI7}$?LeWGFw~%8HdegofVm#(_OqxDNEXG{~ag$1Z|e_1R@>3`YA;;SwxI6{1jqVU`)nU 
z*bh7|Lb3R(I2>1S<(0i1$Igr2Jz%I8PNx{i1_cIGC+>b+o?|8!DRy-h@$EKrh*+X* zQb%}*CFxRi{F~l=aDbz?5>h7Z3|KcxJj0p)r@_bz$Pirwza{VJRXf7W@|PdlM#6o3 zw0W*O={{QGH=`cRy;mt|HEtB%I$a5jR&-t~+;?O9KkqZaAJ*F(@0J%DBc7k&^x^jr z@||#;M=LbIyZFz&kJCC^k39MpLmw4~`;YMY*X4fxne_qlBShMR9;(j=>@}#3qi|_$E4m>V@N-E%3Y3!Xc&YcQ~q!%ErWvZlHC& zwyIo7{_2rwt}$K7&D$McI7ohlDTpnA7Y6z2y|cSa=Z|dCpZcDOmxc_R_!zJd)t?@dupo%rfXCOr$7anSDOB++4!J32*)wCr78b*`05*c zZGCtGGR5&{T)BljO%ot&WoK1;lqkT zjuDiUxFb$8p8+{9Oo;3NHulZmN6Z&V8v|zFKfT`Hp`>_1bz@`wlP47@sv`tz1!(ob z=5IFSNT37SHTJ)Af-unX$eG%2ZR9%AJh?)J(*jcrQ<+q^m)8`Yv@yx&ke~0gX!E*Mr-*Vn%x+iu9j+7`^&vxm zE!$IJvKk$epY>W}{L`Il7`f4KL*{fsM2}3lI3_W^hMjf`DXTXgxNyR>cj62N1%S3w z7JQ314vU)Y^=r{CX@5N7;2&Mh!{j_O^Dbdi6tbVw2g@)z5J6=>e4YufdkWwmbytQ2 zGQmn(V3I>Dy(yR0DG=Bu*Cfq-Ji^tJ7Y|E|@*0x=(Bxt^FMtaIlU;Rw%WHlozuPr@ z1D4JG4Pu*5NJRy2(5x+5EH)c)66amg%ihLJfU=swe#0w0ct)o{8<1shCeIB-&39uN zqmEEhfh=UE1fN#u-o%UYjh}qf7pH8Nl!QZeDF>73rLWF7S?KqzH9928$mY8?Wwt^I z@y*vu7XG=wNWz}ZVm+cCD5pzFljtWIBG#SXo?1raq2}_$k4IQ$al@1X@8n#!*&!|p z=5tLnE;?-Zro5D$Z{pIXLng#90E2A{mfgJ>bI+81H_v{)IbvVaJ`3BsaeHMx?HU*3 zuRLbE5y&ock{?;O$`ckjTJ-~e_BE@-k%~WA!N>m)`?5H{_^KnQyK84{?qXSIb%`fv zT$R@}%O0`?GMjW2ZZKd?*ODfu0hS5tJ6RS7*sC8B}*eik8Z6&!rU9I0RG7XP>nYRFM-`B#+Z+ z1|AaXwSe0;gxHtxNa92fvXR|2uZtmEYKLSs4c!87n-rW~UX7P3%qeIgkT7ON&Su}F zR5uAoXZe2ap3Zz4iTF%M;0{JNmSarDprjauTBPyf zA9KVluW_zH+zOOEiQg&8F>)nn`p0L@LtGgPms0tttHJrK2e@pE@z_kINcvIxZp^hp zktl@2N?Vu>iyc$=-NOmaDOd%tDK$@(N)xR{8V#|isS0fMY$cz)9EvPPlydSe1~E=i zvoEc&*4vLR55C`xF{B=M=s>x|kyASaBGF@_Y~MCppGPa6b(3(SkOL{hc0`I%jZ-kI zRz{(d5M2Lk=?Do6qiM$HkjTVVw1=)p(e9I$f)=N)sKFfqh?eEX@UA$Ml#?_t z8b?R6{~Q>BJP^2+7n+qiP>hT)0GfvAPs*W1Tn-+WxF#?R6eS!IP%9q@;Mu3G!~WrW zKr>(`^~W2=CpvM31U}bsL>S3F#7$TyF`AlaNg<4nS|AARW5|v249J~l@U_y)dLK81 zA>~Tz_HNibE!`@6>}k;SA{l%rPSyGsv6?@vDMT=^$@y8|(*Dz?bILpWK2G6MrXaw$ zY^>$-+FhRKwS+~75|z?`)=fo3aI*ZX@M`y)N91cmq>k%; z;g;f}#Axtn#xb*($-Te3PtA+C>}W!9qOPo!mIwZ%-?*5GZo5u-_c*O?>~TD)h_tg) z2E>8Gga44StU>E!Rl2D&n$=Ac9Aoplgy1#}HE$?18?~s2j#d$&ZiL>}d{}zqx6zLa 
zraIn7hBXQ_JC(g127%%D^~l>b98~Kb&FO!{B}ALhK)m8{b^*HF$|Xi3EGR0>rczF`9Rmj*6E%L672SGi{BGd?b3e?AQY_sS^l$4GV#2Eu zntkl0hgI<&lYXv0&b7$-&P`?BcFuHXb2I5Jc4U}XUD(Z>W0DXzcwUj{FMFEJz3<)L zkDRaU^@9N6U8jUDH;k0Ka%cPfq-~ZnR~DA#_B+kJjQr^Rp}O5hckchyaE#R0lXu#} zhVlBjwI*nnvd!f*_)Weeo_N7kK9n`=mpyuwK6O4x`CO^jE~p<|@*~Y(H#m!2B+W2~ zoC5cBd%`glN!T)P#X`Ax>9f#cOazDOxkAV?WT5l0|MUPCc4APE{! zA2aKCDG2I?P!U`}fh-ml#E?X&gj^XR@>TSmwX_+!&K|O8d2`r0hCgFW-n-sy&u)F_ zdSmFxan?Qd(U+$q*O?=ECzD3z$+VA^q#sm6=Ecg*g~$^Iac+BnV7u?)<2#iZ0)ljT})XdDA0;* zVkgZVI&`oJ6QESvOBouz`bNY2;IfGmTlg$^Tj|e$K#*SD0wJdQX2Ih(V6zN55$gl( zVw5~8AEdc%tp#k`aqMLp1NC1%t=F8f^+;$!!eL!|C}8vLX-m`{A7vqBc?Oi!kq}_A z8pbt0Q?Yg&L8Wmdc*9+!PoFmY@#?sL)X6oX@Pagp^RugUkI?i|Du0x;DE#ECaf-I-Ptw08H_QLL@W}#DKFsE2r9IQc;iu|1% zFJIsF(_?` zlV`>?0xV3OF$$p7=e4QPvFzPbiLI>?)^t%|i_>gF}$XCZXYOzUvjl6bNM%736t7J@dM_Ax|qOsQQ!UwTuhG=KRJZUp-~1~tYJq@mRR zYfPEMN8==b5rq}HHkAH`dWb73fQr2=$5yyFCI)k;m~ zoTfnoC?+dj{<0SC0j?=3$OWIC&;bcuZ4KN|8o#>W!$f(iDQHZ0|O}3+zD|}wB{R1K>^@G3VB(C z)(0kp%GG=*!knsRD>K_lKe4Fsi)FNlxaw-PJA>JjGm?ap5}bb(7*GOW#N>vk6$2}Z zW(4Y@oNWadQX8bKsmhSHIr$WekY02GKnK^bm+68f$Wq$XqA03ip_HZS4|0kjz`}$& zmZhL2BZ_nAU5HCHglG@HrbfkOmvQG|MIued<6rZYCA0*CRgICVR0pvNA=kp_enx6c zr%{?+R3q+&*o6?eP{6mDK=>jr0o>HrPQPBbzC6+t7KAWde)-VCb+sFVG_N(cbtr@(1TA{S zC(CfPNBAXVRI<;suHJ2&%3MtO-0BEp%_V}Q*~O9DkO`bj3;TDVw+ABi%aW$fmwoQl z0E5ji*)7VOu-N3$w@ws;Sne6b!}w+YH(QHVy>#V3#6oV(|__@{M#VOH4&zvQxryz zwXk`v`s!#tsQ`4y!bqK|SNehsC+<)wIP z7TR4Dx7$S)*+g8vH`@P+zRiR|DC=?=FXfR?!4onMZE)B3!7iZF^xMLyt>Y_H^dhKG zt=HL*>~+h_HyOldXFgZSB;L33K6B7)&fBadRJ*5 zP@;1Ro(})P(pZ!F06VT;2dhfW+KFJ~BAst+CL-hyPm8!^=xA$kRsiXBvg-0<;t{)4n$TO`85&;w&4Y43M22_}(-_R=~$LRyxnvRE{`)2V5oP=P$U`RUZ1 zG{2*-uEWuw;bygHu_5X$(F}krzleXiYU5)7A?!$$&@}<6w`QBt3^bkAYctL@$u6;Q zmHYFGT;$kPTA$}@vP-@lh_M9^QZ#qQOk6LaDhwtoL8WrK-h%4CYn%cXDDc9&tapwR z2f9WmV(C-x2Y_ktN*C2%A<_gH zW_XILWHw$FAOH*`3;+T`!bOP|qXc1#Be?_x0D*#gnRv7-h)9U2u;RkO1_OV2n<7LD zeNiNcAdo_b2;Lmi!EQ1jK`PO~_0v2M7LA?ay|$E^7($W`jrQi*?KhMoh$8Q&4&4;j 
zdW9OE(x@$xE-V<*nxh`Y@ta?97GBCrFx|!&$f&FqXV%c%V~_OmH_(FTd+;{bjeR7d zrvb$dgL-euNLvH9OY~eC?;iLCblJDycdk$@d!0;W@rnqh!$er>b`9ng=&#ljEKbQn zj9t_SGZ3buT|tUS5Y5RH3#S*F2lMX`DVhZ`P4nM<%=-N|llZD{>4+zu#$v+uWQh3ILuTt;u z3@XW#3SBZH76|DRe0cT?UQ;E)?1(9xbQ1ePOH+HShCR4=e?J>FBjG zX}tO+HgS4D+F-<*{2aYTy+_VK-f2M7@wMhw>tSyAK(aYi14yB`V$ZqnHn%()(tden zj1e;=P{-J=p9LpMOzk#C+5B(O5VNrB7hgHhhE zV`9dNmC@y@8f*j9sS}~qQbeP^K6;3xp4@LUv{$wi`(VVV5kI*pv{V+~Kv{vV{fNMCi zxm8y4hWjC(Bn@&uq`>Yw?u*`m6d!; z)uQ_^R7HgmZOhvuWQ0ZYMc=&D5nWR90@RuXu>*b_7&nWu1o7kS+9ssBt!JIZ?GTus zv*jID+6yQkoZttZ6RVDXZ@U|juPKMG4N za!*SzxL_fF65wPTMjME)&xztes0baaj_{kd*Jqu|Sl8xIJlZFL+1)1xl|vdIL^twyDSWgC#$zQ)9jAI7n`K{Zn)e1*ay2KvT+;yO6kSKP`C1k; zymR)EXVmFXf&ICxt?uUH8rhH#bu_5ovjf8Ge@M!%a9vr` zoD}V3^U%MJuh3!y!q~$vL0qSGUa@M8Ik?o39HE+Si;M%dMTDdB>3E0lp{qFU!qI+w zxa32)6p5&D5fB*CUuLttL?tNfQT3-E%{5SwkM6EZF4$^-wHHq+>*)Tq9AnwDsG*I7 zpV<6xYG-ZGD@L<03HW)}r@Ktr*wIWRA}anvEfo)3Y%>$}iLNxaZPirR zAr7zInF)o3IdV@t#|@0?^`ULRl1F2T7(n( zNC@0!wm#jDi-2?Di!#5;@AN!OC7rI&!>RR?q2(*cK}@c3vJGhcJ~$}!z)R2+-#@DP zsb0XXdJPd8@gT$K;Hq^f;-g2~OQHaL&l=dMdSKdxa~4)?5FEe%^ZFRDJ4&%Any_h$piLH^zN`kh!fH&OyXgyVy^(ExgBb8k#{N!cBFbb6 z&S^k!I+7oZORd|^vwRikLGARLwm~I$&{l^we5%H&2Q`?_76Q8cWk_qab=784vSW`P zqOlujooia@FNQg9Ytma-Yj1KFD{+VxcKXHqfr?euD?+L?i(1vPx*$||=f!M7mKXxI zOYI4mSD_3@_4gbaPq2kL-&FpTZhcPA3XEu&RC~>*9{?6hH`xVrvPzbZS>M7P}9n>jEPr( zlPVCOk>4+?Y7inm8t*sXmL)AACMI7Oc`!K^9GQWHhuTTYf>K01=6s!z85Q#wCcjNO^m39gN>y6^mLaM-* zS0JwGGjth!D7(eVY&z(LbqAQznk_!txWd9AyO zC6O7pm-&&Cgur|v+aKP7%34(kd=2vgi4|y9j+V8MLRPTe%vw@%YP7*!zal1%BLgC{_jJ`M&1{Y_ikJ)O<5p!Pm{gH)z-A|$T`92BCP zP1~bK#)mIVDb_)>icqv*G2CG`WC^nNQqq@k+FNJ$JBZo*4k{E^(ita+ehl@JSO|S@ zBj<4yHJj;*I~qI72}c;j*o#7SL7C~Uz>L|!-Ax=myx+?*{Tsk4mU#{QTkH43dR$m! 
ziR$-tR6=&~?>;6m%0a@WpPVh`l)QX~vp`yVEt;>KK)bQLIOdq3vyVMzi03tG$m+lO zTOvLj0@0ToJT|r4GZbKwmKyF6>VcCQ9TuC z7{~X1qh8Aw?zg8AMX);IMpXNTR>rL?%JZsSJ6FRCS5EA9@9+#7aSi9^D(fr@Y0~1m zzGVZ;u9Nl**X3U9ospQmIgD7zE#xS$E|r17C9ymN%Fo-`W73BzTg0vzL$}-Bb#v;L3&dwQ0q48~CU7S433^T| zjmSRp@dy*0&HAppdFjrRe#6{zy(?aZp7v}wM676$L?oAv%VJEYuN#oYhZTZDWG&%Z z`;LrYc$YkI>1=5kce!PBC(HC^j>N{o%*g%DOBM2p)ge8w>7&4Tp2U3y`JqR`HM~y^ z9n*=o=81#gfsBD2*5LjQ9;<>jkGwk;&X)w;kQswiQYR2+ueMPVhG~P7e$j zBfNd2os=9HJ~~v1i2GhUK&Jk*786fY`>|DkmBB33@tpyV`5OExko9XjY!rbQ+~TL9%g+#vx(M#22ZG266thXg&o!1d)u~@fn3s$VGqe7;2_3 zQkfhSql~Fa*`~kMQOPAAYOkL6lM}|SMNl;kgMV%nkI!TekvfA2bGy9|Kc4I-GsWxuJ;ZPal|s zN4C)PNQbsqsCMG+KbT_JB?6zkxvV{W3hXd9lbvH+Yl6<6Y11s*u`Wf=$gh*|q!s`u=9o3n?{F`?L{2)5Z zYO}c(en>8>5;2%u1M&M33QG-C>;Y=9AM)bQ-T?R34K+?N-1a};(fIp>QMN24`pbm1 zcJF%*9eZreV`L^ZCfa)LHwp9)KPe<}h~eKxVgb703CyvKJ~yB6Z{zU9g%W|y5< zJom+*WGAzCO}qPD^_1h@oy|Vpbu7DU>PqL-jyHO>yuO9LmzhnR^VK-4N33<_6*Ci5 z!+MN>Y|1T5iggUBYK=3|SE$2}@ahf}yZIU~N%a*p~k8V5b8flEN{CcVt* zDekM07Q_~ma+VS&5l$3jHy{U-qo$6?e(zMhmymhZm;gM4eqRH-Y!L7^k+T(;#z2+p zwLOi$d0pyDVJ42ZU3z2@KhH^bc*HN(q%~Qj-!iq_6;p`$pygWY$M~F1I zh=HV2j#Qc)F>FUopsWW?=e!Fb!2YJ5u$%8W4^Z-B2n>GmCzRpD>uHh@rACEcPLF`^ z2njO^x^U1Nq#sP%noRmRI20vZtq3G9lF5fQ&dca<1*k7~+&%k4Msh$eXxD&rwgc6; zRSmv(uU^b7S-?y^HX0e^9=QZ`jo2_mtM}Ro5pG6}+rq_0RC?(kA8D96(0E<6yXfzh|K>Zr|YSv|04h4o~_`BjJGKO?R`PUygf(;C^ z@!*|>OhJAx#Cu)A_v`Mz#H#Vul(s?2YwkK(Fm^4$O25&|y)Vf0ggXcXjT(o8B6e13 z%{@$Ipoi<_08jmV8HrH1ebFoEX9#KJ24#iTb8VYHtQ<-+Fcc0A711G|<;Z?#>_wh~ zXA05){-zen(+4hjW_JbMdP>Aw1IM0Q;TD15$m*l5b9^X(O_y)vGXz^fpD-%%tT65H zmMYLrOgTIu%^L*dUfl=~LN~o4fx~A@mkLfUVJQ)Hzj`4IqN%^njU@22|wv=%r(%G9hv! 
zCA_6sP+L7l6o~Su!2ufW8m`$E^QKrQ^5lHRJfPy2NU`w?D%e*AQ=M^SJQZN=58eGo z2)~&W1pu+YA>hZ)gquFW%{9e=q$BsX$*)(MRX&kwa3QG`EGM1Wf{N+m!=3Hp1yfqR z>w#5m^~9$s4M_erGRj^iU5cGn?tMAlCcB7mL??_~6r4sM2;h5nd%nCo z8CKxN<_-;^p6`T1h{2I2HVXV2%vZEW9tpy(Y&3bKZNj;lbqGFS=vi~RVU`w6S=uW) zFnvg}Z|bX>!Oa1Q5mxdudB_jko{z}aB#7;#T4^PHytdUQr+&DBV-V&#!ibjf&>gr| zA)z|MJR5q#QDetPy6+w!OB1<;P8C%O+e!6i?KVC2JH^SKkjBzc0IHysMY9RKWt>m)n+sOiP6TJi--d3NdK2jN z>7Sj+-Y4_HVy2)Zr!LJ5Bxp)u+7Ecf1{9H6jjVDc!RD<`t06~T@%xf_!|4n2T#`G5 zb?obL`tV{weg*^LN7GZp_j?ol^P`{he7#s!{X@1EdFj_O#%PKAt0u zMFX`{rZ{mL4ca?Jsa45F%bJp2A9c# z9~cOvl|S)r$h4_#o0H1L*cL>R#EHm~;*pxU#`yOU>z$y>Z0lZVMv*Swt<{z10V^z? z4SpmO`e9b@dkirAbeY>Xkxk)w)Q-NHS^kdSL`xf3SSGuvlA_o}jVg+e*!z7hhJhA4 zV-d>MAWBNsKTC(sOTseY=jWehyFsEkci4M(^0;*p`S)(0Jh5nE)_0AMGKBDBrkRq~ zGp1Nr2@)f>Q=VOpa{D>E$+K>yuJ)g!Dif)MrPQSLv>=$W9H)|2OJl*jB<-(jB=}XlhPzA z;D`-LvQ~vQuGEQ)N`I+9!REUG11!a&!?%PV_)yX=gOYcBz1GgO4m%>!ZRL(W zf{Y)Uxa(-<)(O4$YI#SB%e||=t2RsTsz7n4&9+gI*HKxWLkuj}-6wq}F z?+2z$gjrOe?T}`x=+Vgvn-DylsmBIE&lzNZMl73*U#_35r}qXT-r+6w1;l+wUzsf|A8R1apmyMDeDRMz+P_Fn6ImTgstcWCoc{5?}zmqkBmY ze7X11pJG>Z`nT@93ZMUa2JO2~^h12Sd~`&fix*ncD~|Fq8UnwFO{PY8oFITZnQGP4 z00f>41Qa4^_UN}b6?}cjv5yWjx3*LosF2ub{34m@F0#>M+K(xif(lAS^i3;dO$|jP z)RPzgqF{QC$6sH(!$0hO7Xb=(7*9cMH50>Qm6*FBFb4y_cR?5)R@l=$V(s8jVKrS! 
zWmdS!Pt<;|j9$dS!9;xlJW8v7&H=rA5==B;D+U#70(x+$-V*}9;Vsq*ijWt$UF!n@ z8OKj&bzKkC1P?(U4qJ}`?l%C|mP3ntP{(?P%+St{nkjwn_7nP3?kWaEJN`8xHnoV2 zP>vG#yFm;TieXC-JE?{Tlw~CuA%!zsc@PZAP9}x;TKffRF6-m(#nf)A>H`sZQa#~b zn(WZ-M+;L~>hz1)rr)rdI#6%6(t7oYY|YrB+-c#p+Rv<)s3DAK>rgMl|4h!m$x68U*Id?{n#9p?xV6 z#O_Sx`DgOypepIEThNoMQT}8>B@4Xw%ZCKEx$LWu`>)UfS5exmO$Z~2fyidb9ChT$ z4|5b!_lr1|C=%eyY3s02m^)y)se*>mI1r1mxXB!7txK4@RcQjlP&y~CFuP>=gqn+_ zx0@*H^9`ZWvql^~T|3%FeyDWl*-c9(IX(D~2oOUvY^j$y?n2iImYS3~t@XV$ z&9casUAu@yPIDv~)R`k_>Lth)MKQ#JP3k9HGhN_4W)^q|FnOz{n&P~JluEJ1gMEhC z>=-I)%10itC(EXZzA3fy84T|TE?_@k!5H)qJ&9{QM*%8>gTSdwJ?d2Tc>zip&0~=0 zGM4;4awymws6c;c^)eW!`XZpI@(n_F=7cU@bzgQ(C&KI_6YBL+({X~Pr81x~+|GOu zz}Lb(@hGV5PFUds30YY&)2B- z9ih`u;M8-q7axfBCGc#tkVb}VHWiBZ4 zPkoI4*6%;Uzcexa@BIFy|DQ_#ANs$}`^rzw&B#v4=4&%Ek+5<9SDqrGf6Mb#$uHxW z|Fx&YU%P$<#eW&)Z&Xz5A0zxr>3_9{t)aSyjghUG%fIpy{Nn%nR6af+OhN}kCr4LP v7dm4H2RcJ1Cwq57H&Z8POM5#)CU$xzdPXdruP)^O`~kUw0RbrpDlz>pZk_r? delta 30813 zcmaHSRX|%!7cK7YPJlph4HPdDv_Nq$QlwaLE7IWZu0>06Ey2CGm!iSFP^=WUo9}oUXZ`tBco}`Y73;Upv0kkK}jVJq(*gO!%F1~R8B=lPDGAM zeI3Ap{u7Pu4{EA;fGVPmj=;{w(#gr*(%H(!+|r&WnU9W>%Jctj^R)5xeX7$nmI!kuZp;sjzJAoot*fAvWZA6etw8$P{@fYztUy=a_UNNbLXb!~dUd ze=|bka_Px)mQdBmQOJ=UFI7Fy=1+Q6Pu+pb8&Hm~X&hgHkd&%+7>vihT%#+| z8GjO=2j?y2b~YhFC9B*4in{WSXj2HVoCTWu^%H2GtZ2@OlRXD8!{NtHCQRa~kz!-! 
zJAz9z?O8Zc%FK}i%uiOO-@gQp=h-Ug@aPnmar_1r^$IMc)Ws@o2Wa*eNzo{A z(&*8skb?@0Mq_?rVMPc{qoPV5FBp2p4Gg-JSzThLIoVSLy7i2>C>Q?51`?Z5;X=tz zk#z>LF1|E``Xw#o6^B=r8>uL1B>bKTPZ2T75OU=}VlPStsUJd7!p(x=0Dj$fmu;5=f zZHBR_1nMP<3ulu9IB6mpeCS3)4WQsp>icuLgaqA2#~}LY>e5fLEc5dGUus4cUONtj z-hrrWS*i2i>4*hcIPoKS*#21z&jYBZ%$3H-T*hcy^Z@lXMivEvmsmyN%ykG4$W8F(VR&u9!h?Ts^5#th7FFv6NF|VT_~Yv z?#%jd{p@Yf+}8(FTUWEZ`)FUOA|7NB&}2sFP%}T$T*#q7m^9+XPk8CM+lQ>jLmFz= z$d8x@Rf0m@*az<)$Y>?Vm+Z$J?!edzf`mO$eDtp*%)gQoMuOEUmB8xpGQe4~in8*Q zInVt@Wj$tsN!vKfiF$#K2#-fJsu3}j z{InyMpExMSTW%Ku~7;Qj;5zU-tKNJc|J+&iXv|7tD zEkp_pzeXBGo{|)-ANpiJ14t>@VM7SI!B{~8K0 zy)=1MY+`W*!mnS9v?z7It6`)Z@zY0!(HLowsM7N?r?Y$ne`IAz*KRVRd6n|-Lk#C{ zWq^k^)-O|p-_$27x3h2^kL08`*g{7Psjcy7Ih~fiG+r-B z*qr^c{5x}1^d^g2#%~vyd-t-?fdMy(;S*9!PU=5!>`#pHT_J@;OfA-AydasM-UHc5 zX*7ChKV{+%v(-wH;}NbH3Ch7lj>G&lQCO5lEH*1R8DnQO zFDT#&Zts)(kR)7fi*4LZjPy>OSj*F?mG&L``Ee-@GDDa!bBcSxS`&v;fi16m0F(v${eNWiQhEEr}a6RLQh{u`Fo5J?Jl1GGHYzRTb{%*?1 zg-J8#ri5&rW?{r6A}8^4r5lx}_BfW)Aj(h6 zJJvFz=Jq<4(;kj43pb{uW$?8~Nzh{|Y9`_zjTv$Pb5adaA;+>%;&U}}@0L^>uyUAa z>sA1x*`koB2t+r7P*FC$8f;nf_kvLW`-=3hh(d^3nM567LkE1WVp?wC63qe>j54X@ zB~GLqCJR-c)^6o36;l~SE7wCT@}-R{$1_ zh!DTR-Mrmm;M;#m9x%>t>X^qqQv%Wpw@8(I29s*M`(Ayv$l@uXZl3^$( zioLD1(a1+R;_u}+IpiMdO(8kT+Cw=z%0tDqD!Y7rvq{iuk z)ih2)Wn?w;_3YF}DZm^6qUC$YK9ChBMQ6`&&bn-*fftBV?q;YkAgNhc_S<*A$(n_u z><}d%NG78;3$<2HQJ&(EJ3yyV;0>}EQLI%Z5d-c-#;|XTrVQt>3oB@RFE^n)S_uDz zv~6cXkpesMVp72*!ZS+=D<|a{{~(@-LG;mRB$9F`c+k3Vh^Yz}Q;7bfc&SjnonMfJ z+NiDogQ4_R>FLUk!>wwgb_qkdyd8PqHl;h-i9^~_yt>eh{bgp)l< zO7{@%C@UoVeKM&sd3X-PS3~AuEfr=ZG&K3;M#H&e+`#}@EuB%7eo38whMK>0@zF;s z8k6+Hcw+U`aU53u8mIq=W{-5@pFCpn(g&q`?t_tqrz(?@{OcocO4eFzWQTSfmOnpv zcCu!pL%omnk;=4&O6l$_BrVubRYpIk7T)nQD3frcuv1&gXz8iYA>sx<3o*xU?!dnS z)I+H34xGzsjVL+bI5KrL)R-Jqxf~qqnl=zYvo&jWGZiHPh|3mrU&~HVjWn5%007|7 zBEnPANX0VtNG1`kViVLF?l-Uz;8IpiL&btBnoKo)ufbJ;Hzon|-;vFh|71rQLPaRB z$y!Jy9WiuV=3ojC{LM(#!|0b#m12?Dt9j4rSS1}T61-m@#0m4+ej&mnBPPNPsn53; z8I#Uc1_{F=?kc#PP*jNWD)_egrbPNYPV5Nvq=7H&UdHK3lksr>9i!a~;&mbh4MggW 
zRcG9Ux){x28ox9yVw#3QERRuFeQA$T35RTs0QkNCkWu-)_>pulBak9vL?yNK6bRxu z^$;WVA|nF|Lu>~|up8OLiiV8v5W$Z%$V?xHIoDRsZlkMgr=m6J+Y2&gV!|YHB;T~G zK^6`2bw%onDhvt30W#|*l|u`eh-Lf3F(PnCNZ8o%ari;})`b=!VOUsknB?e|wygBO zv(r(dnlUtknlUz!wnL)n2WPCQTbi6!CpG68=ArvVXQ2g{uQi6@;sKT^%s>%8~Koyc9dmuqR<>8dQA{*sgb(ZR6Sn*pXJ(>4p0Wt9D=X>uEF@qQ8U@HmmrG2v*n1DUDrK^?x$O^LWR*NYo$f0 zV3IDTLr$_yN-83)XzJ*Wit+d%N9SS1a5w!#R7jLET0+^Fm>$8_JEFv#OmbX>m(jqW zERA_x0suo6%6l}d3x!+qKGZTxr0_meuzphHD`W2*BJ5F2D09_snPRNQ)Y09oB4k!Q zz1@o)m}G98rzit~c{;E?X(|*r`m)Tx!njUHXqGX!9-1IZeXvTOnmx>SytSRU$1*@Q z!td@@u<>?xuybHb+q-38c#iSn;Ua4>O!>EL%I@o?)>uKogngcS_ruA2sglwBU^>A}2C%&!DrW%Uae~cU}7q zQSFaj6Y$=x3i_F{snW|x^0VXVKU*<0G(=h@@|?teNKUcRU|`2lCdmd9?DQ$jt7!=t zMRCsuu$QFd*cVX|vKtwxMO|3%&RFjYzOo-x6&x#smfPeOIUXDa;TP@nF|a#S?8j*$ zoVWb5Wbvpxx&q-)D93x8c8f75`GII4;K~8o*eg;i#>L*`tYI=ro;%#0Dz2=zD z!7YRJz{G*2J*SZt;eY-|7>AiCrjXV*GLoJ8!*_E+g`;rD4e6qH6BDt5HPbEYB9MW6 zDfC0mXwF+5fo#m=7<)E^&x5$wjnlN$lR??)=sSG@z<<3#$66{>gq5ExQ<4>Igb|54 zE{_PHf0Hi=IX26NA=;oMneRiqy&lP^2&v+AwcU~WT&vnR9u3OkE_#_m__Ni+1p(y0 zP_3#N0XpBMmq)Gtp3ek@AAnCrFGOrAOmKUn6IDv|A9Q22qE=5y_T8v#KyPg?k z3hc^;$$nmP0nAf&X-MioSFSY_V9y#&r=86a&8wS}`hj;A^sT%YRGEf?hAkdVr{th( zkFJg=UedvoN9Ss@@=Tf~LfY?Tf|9-<0Peh|?xiR&O)eRzrQ%sM&x|lQs1k7l0a8d} z%I!5&i=mVRWEDFWn<%tdsj&+Bpw!Y-V7n$3T5yg-wjM}D?^1HMSogo0QH?!>PU&wF zVKW=KdWr^T4)9y@bQ*)b04+ZYMhuurG(G@`xbGTI{rbJQv2_N^RI8ucqwym==7M_a zCTS{;c?4Tfdf31Ih9QyXGoI;FQqOMar|~Z65dZbIT4Y{g7^i~+LmcL1a+090Y@uzA zCRz=)um)>1jb;j~-ytZQ!~M2BDeV)*kG_eDWHk$ou^{GTR4T%>N zEMgdr)FM4AmFRIZfnoM+BM4L3>(6_ZkB0p#b^t^Jf0qCJZC4iV{-53GVha!}U2P3+ zMV~y!gBO!9UnFRfGlqcxP^c(P*iMNH`a05wj3P|Vnq66$4ID51wN;!$s3MT0Yoww` zUk0R%2`(+6{4cmY|3ld_d{BH`6k)R1JWfPO-RD);*KmudpT~H594+GfWwn2r%>%g{ z#Abg6eJUIM{Qf^ckX?sckpfO&p;Y?I&nPB46Lk(`o4}(Pwg)E3?ue>mkVvv7lnBsM z2VP^xTEkg~qZl17@$+L?F%%Wk{;=9U1Y<+jBj*?LL}%fnH>n6M~s5^rH_`%`kb=oLP0^#RAx4OsI(si%RY;*@US0^jJXiRtPI-f zFD^u6d~|f54<2McKdCX>$+}t!IO>uMDxZrEot}`9L8aou6!>CP1e51X?PSP+>-Hxg zYC%cscx+}F*=#WR{!d-COF5RY^QlgCW;LGXopOd+vsF*XGa33HYhABgSzd~jhs 
zTdX2qjqOiCa~MV#E`AJ(|0_9LoS2--WL?djLT|xrWe|;CY@CAft|$Xd8%YXT%2-lS zNJN+%G-AK^x-c7=rNsVKs){2eHx{uj5u$fe!BOnFw5d{dJN;fK0b9Hf92!y_vJ?8( zB!k6(WCwt!}PSmz6l(Cv}iUEpj@Wm!+@%4;Y78JVwTv6wV-{q&5R}&^C&io zI){THXyJK+IVmEf)EZ^V$j(%`bzeh8X<))w<-=#Oee3@sj?@8=^dqFuzNn9!R}xQM zr7C1w0m}iiu1g*Zd3!AMx78dpI=kTU z4{U?XTn@r?GPAB9goe0*nmd221(S&ciUc?c4T~2}g~Md%(MX0nJb2)WpxP4Rgem%3@5f?(??JI#^a2HE&HZX#vzg#J-`jhS*8;@MSV zVy+M{X|b)wOkt3MuyZv-`q}%4PnrE>)AqkGTASFEWG|Owc+Rt#T^uA7*)P-S<)Zo#pFrQ z*Ms*6hcQ(iCY9|4$jVt20pxd|bKmcZS4PMcX|hKJgo5gBf zo=FWh(lY+9Qey}S5E&mR3J68R$KnrXl0N`-j6nF4z!W)64$P$3Bp^<1!SYt3U=EbA z5Nu)sZ1RRJsMReZeUhdm0uj|8SndPw-&s55Xsp$4F=KDuzsSKfSYVRqbLAOBL2%T$ z*XxvQ2KG^rWCeY7>sP7KVa!Ep$)p?!EWw3LMZvOiZpy;MzeF{f=E7v+^8i%b$|Ezb zSwj9jg;d-!^_;&O;!C483$uVtWrq!Awjui^0$egsc4kQmizX{#fSsga*f5_+E&JWl zs5MHqCQGP_9C3v$Dl-%EgA}5G$(1e5IcdI`VKt424bQzCRaQmGAtbRqwFW8>2Z!Kr zg8*PnS(bh@EN(M9|}=K6Mjcl6da6ZbxP}x`20gz6rfWQV>;u@h8vG?MLTdtbf24T z7)CgnVDEZN~sb+P{HT<;=t9%?PXwQ~O7XWGVAEwZGagEef_TD^XX&>h8clR8w8ZewCxIzH3Or|uy{+YijrlUS}iI8YW&D3*xr&DFJ~R*2V@*i9x)ks#O(NAnQWqh?jA;T zzQ08kp6ZwRnN2=tbiHj4P8zWXLEVMfWM;qYd_0}R#14(m@M2@B!pPbGnZ+q%h?a;x zLIaw#IUrRiSHvI(kdZLp)+p>d$i3x}$W{aF;KqbueceJ**$v@9?P7N^0hku879m)M;CdufwAh&6$jwN-5ZB3bnRIfE`y@Rl5y1r;09 zRMcM{s~O>ph?-y&k!|85N{b8aD5!oc&^3Bn&&$w2BeLUJR8kuDeTaeXG$iBz>I|zb zF`?J=ZJLw`)rIV#wlP3MV*Ch{?OHFVOgu6kl18L{+w1yV>bEI&F@;PG7RUj%#O&Y^ z42QfZ3=CYts7YqzeK2K`mA2%g^9fB`V=#@9V$z5aK9(+-QwTZ+)%nc;DNhF|J)pQU zIq@1g!&uW?%@#fs@ZCFOzH~+uK}j%B)i%ef(iVoSa8zd4%Ub;&>kml6_>l30d?_4<1F@~d!k$uYBr7jM4i-=#&Cj8(pip{RrnXK;RxE2zp=Raj#e(q>7RSf-ZQ?O zd0bJpG*@|y9M=)Hdre<0-%xX%Lzg|mH>z*)kNrC*tJgM%_K%#XoBF2~H^^ZRBER-G z+r9UE9@HOp#J#y3(m`$VX#FAyce$sLPh)-o8!TJiT*h z%_KD2=xM{d!~59lV>tDh$KtcuXZ@wmZEdg{tMRjyycJzaQv ze1()Cu&?h|$T6(OoayE`&Y`uQaUP1mx9ynW+i z-N~wVN7Ig%M#)3?G>&7<`jkj}I~}pd+s8fu4;Lj( zC%3E464G%_Qr&;ecFYs&)|_ZP0-xa9J%{7lKBgGvCb4j{Fq#9gcMFsCre z_b>7vRA?b+l=ggBt-66<8(OXIn-y5W>A0F;r)Hm?u4%b?%=XRb2Lh+Z;wWK@K2CQ= zxgPKpuGy#VrXh?L__0vSM)kV$R*&KkJgXTFHF+_}Y4VDjXW{#|i1B`v>I&Z7({H<; 
zNb}xgzmDIthlul!*VBFSnn?MnZ;yHI>(z)A0p1&~(@s2xot9*CcysGx8bXG}!>lt2 zh!1>;YZY+jc(e_te4k5w`r2jzUhMdIQZ}*T?YZ}I)mV1Ep?G*iWpayCa1vVkhp+|O zb$ui4$qMVaGC1hLd;i`8et5qQI~c2NhSUH}!_#)k+vFWtrG|l}&WgnOLgP zT0i6nw=YpB?gjE4U-wL#&5-7tG_=J z-LgHrFncm<7R(;J_NSU{s`6u>$IXO1;p(m_D3(FVQvy7%X^lcAa|!I698hm9d)`vlylc`4X>&9uUGMwhUh`E^b2 zFm2D(4&Q^pi}{lAa$o%OQ4ajT_~M$P-M?mfsr~xCo6+RGKm2rF43_-<;ue0|yavCi zJ?lO(vlPy&e(Dhn`?8KBe!A+*h@j-PWDo zJYB&v+ZL&ohd$ff*qo&wu6y?k9pG+oy`1eV4#D~iZpR;ws(E7TdT$n%9j!JJ>+1Za z+8g0i=2DF|hex$MR_f8S5YJkAc>66Oc&WXwuC1?h$OtP4>0=5%0~E~3!BgQz5#H)= zQ_Vwyk`53|4ifa5`P9Q0$30|I%fQ%ZvKZ6U0;|62dUGq%-4R|RFtpLC;l^;YQ`aIs z0%180t%I}-9XhcnMfN0J1%{2+O!ibWIyVj7em1ER3c$LNkfvCKFE>1S^TVF>S6f=v z?@fJH%aS)-T0+-FppDm#4e@R$exJ$&e%F0T7s{__x+kt#xC!fBYYiNP1GH(`q~*{Rb@rU+tmFLHX$tTG+r&!+Az;JU{lS&cb!RtsyYKtW-Flj#cjA!at_;{o1sr;>;m>%9zI3m# z)qfom+7s}w;m2p*Lb2|$&~lOMkMHYSZ1YVXQB}wOPO?Y>6L0FR@Ad92nl0+rwOn^Y zVV_NQuIu&;paqn5?eVc3tLEcpb>=Tu54fktd;SZkgi)1 zgO&}lT92pjn$DW}Ov=`o3dhbbcz(@3VlIqUfyZ&p5YKqplIKdeH|{Ey%LDq2>>i{i zz({9(sp|FL?FIj*Zoc8g=6gRAXY&#$Uzsy~-JaK5q??N)uby>jGiQ^!mWkL4W4~O! 
zmg(`Ds*LpS{ z*ym;DYE(CP5p(SP^!(QgG4PsS-C#>w_eslY`!U~3$aqa>;v&jTe)p4E_c@fD_cG?# zc;nu9ySuLS&-m4LV5c8K6((<2DHi!!`uAGa8y(L`GN1g8CyrpBou9}-$pI|G3 z2lEe*I}_L3y7eW;JBYI|!v3x%eCsU~t#MbGPwwl2x-Eawg&~VAYxlUHqG|K*VOCEI ztsB*l`OL1t*V~5B_bo85>aN;`zl{cd^uJms?av(bTTYK2lCBzlc5s?H;;t$VS3w>J zad%>Hp%;8ze`?J4eJ?)qt-F{(4EX#VFWwWo9Ki|<;Lb(TJai(P8LZ&KQy47lh-;z5UJl}$PZ<8VL?E@CF%V} z`%Tz2Z(qkErn!F))MQO$pLDg6GPtgb*w4e!+Dfq!0WKC$P?B(t`jEk;DL+F|v4+UR zlC@<$a)7uy!8;&A8|&V1u)$ep-u=Cx%%I7ht%AqEnGxpj5`>;S!Y>mG4`siSn~qAXAaz zQ<0K`ij@7BZDPt)`u{uG!_P)1g6K62#m=rb%VIapLOUS(GxZD5}20FDSX5(@;6tofu-|#>XLo_K*yuH5GKZ{N_m1PNv zu>hn0&cDde*&ygcEsPX!s%Nnc5AL1h|MqURcswdI#Qr<4!oVM|$Z0R@kT-~oc9j)} ziZvpi8o|AZoS6ENM+N$pALTK3)6jHjvdhMM&&tYK?6cE@g+~o2j(~PJ@YL7x(koHw zML%Nl?!M(OgXv+-iA%Pl7d>^mm-C*Lhck;KWZ#BQZXpr!a!1>!xKF<((t@(~#%J94 z*sA-n1|Jq)FYM=HHP+NxZ?bV5|7Uewyk{pd4zlBN0Dr83=0YzqcN7l@Go=kRrO%7fX7pdWk>qFNQPY-n-XB!lXH?S$QwF{B!KLs}zKC6ctCxP$< zDwCxz&G9GZ|DfTgZT^;?F3#qjBfj+iKKD$vIr=1@SWR{D$wA>WU0%1R3)vEm$0wB_ zAIOAAJJ(~t)+A5-r{i^|7Mn?0_`>Q1d~vl4 zzW*p2xkjM~UANuniSJ?##_-RU>RC@*uep(RX@JNlLSG)fyk7rY%L7|Kpc6OWe{6uQ znXb`WjZeFpON?XzPi{)RALz{bgm|HkA^ zmk<3Sw9BW3pl6HgRN(QJcH_;i*V)=%f9OlsNtZg8;4S!t0LTH^mN@HE%NdqdRO@%%)|#ns63w3VRU zJ2U|jcDiaZTsPTu{qS;{PDQ=yOtVpOb!{H5I~^UZ8&l7)u7TU*Jf+k4$!wPZgD*{xQ-Z**wfs7$ z(9Mn4NH@NTTjJv2@L3?O?rAF?3`XHk%Q?N-xP-%E{h-P84>BZodp)O z$=w8OaV2et?B7p3m@zp$+IKb(aTP}dUSQ0SKSE&?g8~$rRl(0 z{>yd#TH4^I0pba%SyWgY`NQJ@aV{|52py;5;_VK!9a=d%*;%A&$)0aE@zEE*cJFB) zSzvVB`RF9k_;(W4uz0&E=onA8SX6cCwTiPWcW{3SM{on(=kIeLAj$I$CQAqpha~E4 zU5gCq`;26W=|A3V-Iy#GLYJSUt>e>|AdE2S)(fMD2Cw@0<;IUs`!?R;@F|>@19O+^ znKf~r@QDYys)3e?29kWZbM?mRbBCZ4Zo$+dYZnYaT}l0{j$R=7F~tho{nnBwVxF5I|g8_|0_)Lt2x z3|CLPmN-J3+twW885e~eV<9Du;rP4~$FWYij#n)+M$rV)RUNTTTTR{_@zXJHk@7QX zt2|76Y6u$R6XyF`&zq&gXc@T@22YmmO-&=K6WanC=B-vSjNUiJ|GB?~#8!Jc9-khq zXS!7h-K>kdcs&O|$I>Er339ZpZ>{X-Ocz*}U%P{^-P1<=8s){v{heva)UBz4$o+UIW?{oEmU z<7d<%x3KBQb+XXF_4@2;VaSEiBk<3TyQ5VrKTP_dbt9$+MrTrD<-9zIW6jH1OVLkb$4jbAh4822=X?50CPz=FHglG+oiKNO?jI7-mcXrFjzAM 
zpkSG%qbla5i{3;URWHgf5{GB&QCdbwhJ=D%SYC=2Z3?2okPqeu5MhPMTJlq2;h+Rr zAW`9AL`2G`ifW~jBIRviakG+b+9~5w)6*Y7qdeCVt1-;enI#BZbq z^_SCs%ZY@vcOSbwya5h(>(UA$6&0BkT6JJ8#W|X@OdkdF?tQ)9@2;Ju>x>Vjjls1ftfQM_F1;mZ_8RAYd5Y%Doev5@ z3TOT?ZqNA~B13&<$V_BzBlL)eUOz)ZIsD_Ea@bUHd{f&zF-FlE{%?ASj_d&Ni&i?7 zC=Bv$bTD#JX8qt?ECGp-8n|rH)v4^Mk{4Pl6;4@NHZ$uH|y`8$mxLS)%?UUAHrWcT`d1!O#tU;R@*IBGd3N2G5GlQrtQ5&LS zkGoA1qkTt_Vx6WNH6JT%*}o5Q$QP(7y**;f>t^`cqB!m$bU$_Q-L|=pl_vTj0OP7rLSU^|QMRG^hv#MbhjXbqUN%5}i4=nh7!5S5<#L*qMnE9i0KI zPp#sJ|Eze0jDO$(m9Mn*E46uepW!PuuE0lSN!(T9Wnn;z0qzrBbuz>e^WuBalU zF3qaxqA)mQ$4gcsh3mfe1qzFNq|NpyL0d8G#a=vQe;ju|6hjKS@HZb-?38efr*LNs z`+WV$ykI&*gHb|QumEJ%l#kpw)KxU(R#&IcAMuvbqUijqo^$FH|29z&jnEHh#~eF# zETKg)6Q+3fW?v#I5)#tA`u8;Nx;Loj%C1Nr}Z1h?B_^^ z!!JwwK4mM$N$(lEkutA?(GPD+P<0FK)BEj#`s(c^if2FBQv?bd6j07yMtUT-p>&92 zou$b^LOAtyioFsGLQssd?$h2N%p4(-()y7_J6%PMKbt0UZynA(_udOCxB`gOZDbhL zHpJX{?_GNs`Udq)>p6`uL(4(7o4mICyt`?TT%s_SxM26Dr+75j;pahm|5B^Te zsnBHKwvP3YwaP&YqOZf$nJ{mqcvGwRNR1X&56f*JO8a>fb$TOplgeSXdBwI~B!GVX z*zRpe zNpM`xKf}yp>j9X8WX98m7N?73^(6-Tp1+$4?VT}X-Yf-vCp9CI=@dPLO4|ObT!U1d z!2bNkyhLK_p}}nXVhW%Kcr5*N+;ycBn1Q|90&q?9 zB;QW4+hW-CE}T9L|F|h1@#C1%4Rk{tR@~jCmz~~>xHWO4b<^q!ADs#tP zw)1D^FVw1*>YQwv*3(-ub!_q?^w_7}Eo5Ozy<@R^|4u#{{pR40sK@5HI&VAuf@@W| zlBJVf4K+}Z3=7Rlv_=IP!m!zM4PbUX{B%DTj$Rd5w??5v8a3CpVu$U&QFto&D<1jA ztQ+MzWIeGgIE)c@@*lY*lM?*{bGR{XNwyJEB1}k{QqAXo_FdK8o3_nX6TKc(S7frW`6Kj9`~*v*KgjJ5`>ki zTgU2ae(K{Wv})4U%1Ri;S0&n5h$=3?Q6v_{#N!4PP`?J%Gds32^&9&9ZS&v_z zGQtDve=aGt*dPxUL^r~>PRChP!ZWNts7*7we_utWFY>-@a1P^N!4hkk5}nkuK~zQT z)HsxJFIFZsKx-RdW?k;psVhDeKCf}JHOrG+jZzq;*TgY}w7A(ky}HqZ;j`CKjy#(r z`AQ(37juYXHAw)~cp$ygq``e+-Dq&s*CHEi3+;g};$ZSpsBOYh(Z3 z$EzU}5lAE8lhU^1g}?JSQ2}!#v+q#wiRdCyhuhCmDmBkq)V77JfGb0s^i~aI+a)dt z{dgWJ^*2JXRH>(l+VlJ_?YDj-uCQS~anO1Z??m%B3D zs?@l0U0;YlVW~f*uf+&l=)+b(JD6dTG$*c88>}uVeizT}X=D+TxWEl|jax}vZ2S6w z99v}ux$7OLCUTc(@T*9Na^EIDxY{^$PDDzRTbVD3Rk*#>{=KEjGX3YzS~;tlZ8|$| z2^P8(na4}zquSa*-08ll1v5s@7$0B%8NQb2i`&|B^UpA%pKJSY&br;&#s{0;50xde 
z9PXDIz=Qn^NoftgO1o1>i!3z)_&dn1^$J#n%8$3L2n?ZS;fRmwW8MFCAle#-$|~&S zBc~-239C>v?nZV5k$k{+S9>Mnb$94juwyVf1 z%uXH%=}8DA^$eUz{xv|Qu3Q-_Hc;NKyr-=J% zds-9rOfB0K9t%(33qIpuc7<#_^UsS_j{1o%j@t$gJo>7=Kw5PxlQPxgf) z55YQkVtDPO23D`TrB1kGOf(9f0+CIv!e(AM6TRB)*m7XBqhbXpZLeycGut(OVex)j z;H!{tCNL34^2Q%0cXvf`jQhO=e?v7+P}v;8&KPRW+staxOF03q0O)r9-_jHM3tgpf zI)%2NoS=Bc$#4TIQ8tMoGR zU9q-QX<|(OwNcw99V2XtZace?k?r~;|I3daT&`v^sud{y1_<5nrmTxfz}q`dpTGAT zM!~|4nN~_fjTIU$lODto<*@jQ@z(Wj;i8m;--cTlV_IglPCE6nC)VP%)A>m8eUNr% zb8u1FtcXeqyE-O#e3`BsJIhhQ=y{n}LP>?z#Wb6(CyP8=idQa|W|z}dBz*c)HO8Xu zNgNVHbPKh~P5>&w@$4imVMZ-v1|(yP`H6z|S_EJCEoq!y4;>xmTCM2VQuG*7r@V7O zZ-qtJZQ1QZfi>N40I%on@1g;uGB$Cg-eR%<`RY%leREDIiy`jT(w1M`3Es1BcNo}F z3XoY~UVPX1{z9mql^x)#*SM9wrgGos=Ux)Mmzzz}-dqmVbtf~9||fZOsXPP zKD_~=b_kXVW0JjVWGfOg3HV3OoLx=wJCRXLY3llWloB2#1<{bLu5%)M3{6!gNU-VG zh*oySL`LH?R02f(67drFYgysc_NF_n8$YdR?fXZo5e1Ji)0coBLxt!EaP3u$bSWIS z6&EH0SELaS3J8>@DL=B7ohIV;M1xIEfRwG>tN}^B)p_Z)jXn+I0hJtA#IQeou_G1b zqJTgUplyw!r27>~uXL8#3OIh)9GVz&3HK|_`u;Re-u4& zG#FMViQzd)MPZkup7*G8)!(#!m5W(-4vzBHKA8L!VqEyaO^-{WoZ8K7vXHxio+>+VY z#Kx7nS+>#*{-&tF^>wOsE%17rL7TMP`ZWn+YVO1$Wi(IyG0PI8GPO6qDq2uy*CrT| z@ArRdd&{W0e&A6QcXxNU;_mM5R@~kF;O+;v;_mLn;S`7B?k)w270RW*|GICz_rBc^ zcdv71O(v7cWKXiQS9UTZ4Nzw`f-WrYO$aaFrC_)z(o_sIC&tmEPMaMqX0$IGV9t<@ z6YOnRL`xq|<(2JE`Lhe$4$CLPLC*jr51c&)J<9&fcPBw}EKF@_hZJ^P=HDU$E(sem9vUu*zSi*S|F zkG~qqTCIJ&b4hbI?mLR6rn)C0w=DV?uH}30s0Lk{#7!La?-Zb|NELLaxJ2+8-N%Uw z23xwstWneYe5MWP$a271!~s^2=!T#?t%j(58oa;GwnOhl3-iude~g6Uk%E$8{8q4r zbULzwKe&evrKuTF8{j1e7MrB|GfLJgXz#_n6qTee0=|;z&%iphOUZ}|h76U)Ua^J0 z-gqHgIeN7m0c@xn^U%{2P?=-i^e=pASphDIX|;G9g(w6!DF~N38NceQXlsbXg=UU_ z1nAtha8kBw?dx}%nAnFBSY=2d_okG1z1p!X>D0Ucc~ypsb-Nq=$oU0*Gkx$JE-lKj@ZV&G2hWUypC*d%DpC&p zTBy?3S!OFUp8;O(1!&LV1UI62Zun(GETCFAUmtgV{gYrU;Wy)t7`Z|I2n7$HQrr^h z4RSz(Mx>=z6hHQMl6%G3w3e@Bln(6zv`}jNuG$@^aNe7{o|^;-`OJ~{T%p+Rnr?#b zY=F&)1VC^LZeFYJZSw|)MeytQ-kfS67rUuFs}Xrk*;{xf0mFz@}MMo{WpPbih5 zb9K%Kj$ilwjY*6HL0X*paMR1-2U8EK-N071+6sOl`_DHxNFG!a-(0duhx1^g(*dtz zbDMbQF5}@E?fP;ujm=4V 
zoRBkGs8DdMxDE^#umu{oqxTI&yn#H|ama*KV~V&w?FB61UQTj!9f z0Z__!#i}z=n>{L59Ag^Ts;DdwD<{G1MB3K=3-32QUq_n6%uP2VKEIEslBK8Q@c9t4 zwBn$w57F0#1Jyh$a<*OPdT91Xy%t%cmx*aRYeIvhbc*6=>Q+23%4Eo;ygR%r{x*wk zOl^c?-^pxtKNm_>=Ew};{3~O<=UJHdK>%te^XR(rt(>+DoVS|)aT5-w-~!HYu7HD4 z``7wu?~Pw~1Yj&0@C?=mGe#ZfqRmvxKj-v*wEi{G*fg=m>XO>xQ`nftEVp*}{S*~o zlQ@v#@w-6F+|l{G+T->Q`>*e0MEhzYLMvoOOm)MtG^`{C+`g(php2+pnrwjM zt8sMa!bXBS{eBM zVPD2EX1u!~BM`cLs87yNkU2m;yxGsT4*0e!*|Qu4iLqsSC-r)l`I!N>HG@Wlymi zM6ji)`@@;xIXnMV|ipHxiM*|Q{85bfT zEcPj$IRnB;2?o$MwGeIlQ4-2M+cEA#Dm)tmS_fuleHh_f+F2`uO}BDj#0?jPMW@ru zgQ3Q*wCSOkY2Awa;^vVk>!=v8c)_YpfBsN92RDZAT7l7z$aQ-nE@a3w4aN!{DPqkU zOo7@xYunWQ$VL`7LG^Lz>;+gXd#ofM#3%M5M3->9ie2m~BQPYhXAc<0&?>)^B1^jj z?Kct41Bmrj^GCxth|n0ern3J2`l5ewWGXmbF}9&DWf*D$T9po!Y5vsvCj>8|%+B|u z@3aB?FgK-U5~zw9Nh^m0!g59liuxv!BBQc3!pO_&d1~4n*gYGFL}PhOsDkU zNAg$1-aZU0o)op;t#kJjA!_kT;<(EP(uJde3CL9 zYdlD7dFtgdRmXz%(e3%jtfG^KsIhh?@#hN`AI02}{G3v=-YdLLZ0e6a2j~NgiyQKz zg#-LL0#C*e0K&_jwBCmNb*|fc&-_yxJFaLV#mioiQF(eWEt`9{1$=}>aKce38HRT_ z@wK8MU!JQI2N9sk7(wL<6YGYqjZ8))+~vIF*P0fOzFt@VZW()M$20{;ArZ9PTz@qTUz>@D z>#Zjp*loP>d0jYo&7swWG!!=xrVS`U=|_m%in(nzGjjk8so9iXiqkJQ;LYWM0p|N< zf%UIg7W8uV?yl)*?}mmy(6w<)NXFyg7clgh6eA}py+|_wj;Pyg1UKuh$LwCzxU9`f z4f^kaun~WTn#abR%f|Gqbw)g)OaP<~3%3~Jl<*dPhFge?A-MI=%C^p4r^;QPU7i!F zX~94qL_NUybyZ}AYzCVWM8by-gkF?4BDyQQ4-qL{Z{w)(gHU)wu8FQDUJ9zE=r=ay zC%h%pZ)W4{?URb|ds)BUzg>1CgAtsOC~2-|F_#F`S*AN&|4C&XEN&HkVi>Faddr6h zhrfYBe5JZB;U}f&v3O`bm?;qk^s+O_D)Kl_URDFtB~S5SPN;d7iOarS1a`f6AYnWz zBv3H5BvgD6>{&F(nsr$8fp8m3s;m`$-j%2tkyk54A5SmdHkMo23O`$L%5HDBOw8Ca z0D@2CP4@)xftk1q|BWHIk$h7%@!uKJHLGIB;5gRO|dgrt((2jqgrOpH> z1ehW~RGybQ(Pd2?Eh_?l!@5(@kap$IZ;s8T#s%L(m7A#ltt&~`#gFsbU3_nfg7Lt;hh8xB^GJQ$5jGarfnoWO2;|*vcU7=S~9*hb4Wk!X%R3{KW{mi z$AEHwPShT`eR0&g*%nz@ZZIa9WbP9wsdKaYnLA)>9@Jra`&Y4bdYzbtr!?p;czKNq z`R9B8`p#Dn?p574wnxLO?S$j?lu<67;-3O`Mb!b8IT;d(AxbujNd5X(aW0*JHgfsx z*kX&WTYzS(pC9C?0JuKFG#l%Hba^!_tE-`Ia5Z6dmEDL_wNebyfoC7aMt9l9mexf# z(M14~K`>jVM;5(5$o@;$qwpK&{3gNXr0~I^`NqWu=rm-(imBxk 
zSJO9WczE3I!7@l*pezn{J6s0T~=Hy@{%JOWO<>tXPKA`R}JV%)l4ayDXE#W3w~iU<2< z8)-9q{C{f_$@X%H*5E2308W(hHf?YpcU$z%5x@YHr8R%&VEQkdH;}Y+{P%nu{ikhG zj`{vdFNp^;v)wliJ)jOys*<&mlcwq>jW@OgMfXj&h{rzQ z#yavwnCFURamAdY7x%Uvln@t3!}G$;$Fu9$y!^P% zE&r752=5FOqS+&~cMF@LcLuUK|5E7i+;%iL&;}xQDz!IHH%yz%S;Xie!6ciy ze)`P=wD8gvwa)+o+j`<+V-q(L=s`AR#rwq|?XmAAApZk=DIs0_DH%4}gG+fGe#DMD zM>im02PlEY+sS;h{b7tDXS1trP7hJwkM;@$QUcST4M+B}g%qYm9U!8O4Wykb%bwV7 zaMO{$h(P=d=Bd|cOFandsk(F4!f#jr_)s8PQ>xoJn+M1++f&!wYsEtJ)^CN3pptBI zp>9*{KMAR>RdV9QZ@w49RA<`Ey1MpO7U?eKSjDBsVF@>=*S*cU0z*HLpvSr1c9Q}! zb25((wY{}-#T?h$O=2rucSn`^xAR34P?Doy!HT=GV_k)*EgzHlF&^R=yc(MNPnB@S zVF*7{PyuQ-7gidXtM$s%#gi?P4PJ8~$Ixk9ljo>7JF6Twj6QR9q15R-g@5^CYZt=C z7`!gIlJz5uR*0t`cSkrI+Zekjd=(JYwZH-)oNX*5>4xyJ{jzYes(omG;6TNbL4i^T zRDr1pQd^J0G(h{M`a{CE8R1*zAX6hICU@Ly3!p^@uL5E!BgkQZcZfauE}Dog+Kd#j zCg72VlJm&&Q?>~aAJs<=@2ohcw$Kn1PK;eN*~@V_gHX%TKH>8Ul!{{SvtaxBIq?kS z4-!t=0OL1AiuHTVt&G1V;0F^;1GC;*bVCzn3Oiun#^kZv64aZ^WAt@rwd?)(1H7>= z00dsu?0}oWxS_WMYfk<6$dE0rq9$oV@>WEN@RL^j6}p|XcyZvA+8GK;O*V_V#udI|u7dDH}bJ*#j-5HK&-q1>9X%d9b`W zpnZoo!CP7rj-d$ryOOJcbt$LY76?ubz&yz0BAOh4rT(gm2TmkX`5-oOA7gy-u;aet zn77W^!Cj@ki=tZatf~78yCJE#;{ly_*IyLTF+YH5C?`0WIj}z16-%vY_jPda$StHe zurQFSH`(~P9G|!n1KiVr81D;jt#RXN^a46WsfV=gSzM}vA;n7uMAS%ej%ZdefGNOi zgoJJ|qaXO$rg@x|a8kv2N7Os>Omx-;1rx#qYj1?@}a75$_sq-t3Yfo}wE6*d%N4 ziVl7NN7*Da5jGMv_Cy!W{0<8@hHgc+!MDRW$i$~gEsFp-xsE2zT$A)&_&4n&%3g_F>}nh^j`fn z6aI0x2IA8k*lV2=&EKzLHE_{452d97dB#A}Ct}gt=g)my5s@$q)nuHZ`KXS;%C`7(N?tG^j8c`3bao{9)!3;kV%9i!Jt4GNfm-+&5&Bul7Kjqw)FAm zevqj%t^D{JN*CI)+|7|ch?Hr;7W~<9U!W94pGht`lT=B7w}DmgNFGWdn99*ZNM-)7 zhR`gaVyV>lj$j&3$TXC`PY^Ziv`sDIiIhJkGUP$bP>O# zZjTS}(Z%iKQj5<^PdUpyafcbELWiXFK2t=fVn9^Wkvild(7k$1$D7>N%4Oz6a%pcZCK%QBwGzB|+o ziSU^(WP2(gi=>GCrAqrRmpg;F7^oP!>+X?C{3A!t5LdpyK)`_E5#C_JtbCD{S>xn> zFECZ*PEb-BYnSxiTCS`IY-s)Nq2Xa5z*Ra9SU%jaNObO2pU;zY@7`zI&>&d(58LoClACU}k@Wy4>OY zJx+bZjVWU&q#s&209Z(^6vSY`ddfh@z=Bfict&8pAux2xIkw9`>`l~y-|rmnrmnF&hJG`V&b&Vc^nwP}Fn3mf;-2 
z2?Kn+`{oS0ap(n6NA70jKyT#j#xhlpz1smdUkkF!#s^6X;EHK6uU>ItmvLo*D1Mpy z^SnKR94S^2zq;O?n=y79B#0-UwwbYFEK2zz#IDzM8&=qdIq%TD&@QAz%mnoL1a=87 zL@90-hnqPuQVy^6EVNTOY1(kJIuuGVssOh<&(h~gpMy)Urlh2%EGH2X;xM=x9T5_OI5-#{ z8XTC!ph>VGk@Sy#YOv5k&5t3GyUz09E#)5%*gc?#y$Vs?GVw#prhs zo;#OKW8pJ>u%yO4frngT_KJ1I`eNP5JVZek5h5D(yaV%A7NP&n<_*Deyo$sQd0zz*2w`j!^@d$G;vI#|{&!s=0IZ#KdP(y!cpej1eJW;i32!?h9E{JX zXZ6I?s$c)pIx@mS&+z^lbP)%BDmO+17um6my(x;l*cQ>IczPu`g&da#a@Za{GD?Cd zU5f+zfJJvvWNvE&!=yZe{jM5){P>GV+XMynG+kQhX;qD{M*5t397E6$CZ90>Z+eC{ zfRdZ#vKsp8Rjc1a8yS+C zn!@#}{37#z!n>9a%FhozLHP{nNvki019*RJ!Pvy3c%00PQRB9%+#>I-y?%kDZ!51J z#gZYn#x%j`Z^h8Yh$UCad%zHI*yTTNV3z^rlAy|9GhAwW50fot5?#^2G;oln0bxu( zM!vE!C^?v!fAog@>@8ckha@kU&5vq~!bGZ(oZq#8xfa|qd)qjD!8KaTtr2benu zTyi7$JiFCM(lSDVzHul3_a>8-zJ14E;j`5#Fu=)XatHIy#4NlA_POG*IGGZ% zWznO#<+F5mT4gzG_PUT;lAONkp8%9p`rMIm;LN)E49ohYoABK3%EXwm=BsJt(b^P? zP?x}R-Ntuz&X^owMpR{*yMLd04!~zMMt1NN;9$>Kp~_zYB?RcR<3!RPNyM-Dop%b8 zMFlPVNjKmQ@fhGD!b{+C4nd|NJ%=qg^_dVI;alDRJ~Yg0gDa#Is<8eyLp>iIIsu@% z+$?R8nmo&YNF-d<F zNpQ#b9y+N;u{WvTDdo$ddyslXtKdzBrI>B9Uq+K}rwfeB%n;s(j4DTR7%ju$r8=if zy-|;!QxG9pR)P3_Ti_#4c}P+#IuMS=h?!+#{emgCk@FKzoc8g64@)V(hDFj!vtJ4^ z6Vj1RIs);pu~H4N$=UXA0<`w@(j;ttGghbvq|SD)-U?o^DeLtQ7`C6Pu4~3B8M5vP zg?w{BNjq=5>gtITa5-2W`1ST1h0m22ER;9A7Q&f3%~fsF0@pS7Yp1I?rbXf{p-H`K ze2(Q{xH@OOWDu=nkPWpYYRcQb3qYMXir>J1$w(<$KUB=e>p&=W4`7Kb5?+E@Unf^8 zZd?Nd>%WS78PsSA+muTjhp&(8xu7o2pb(nc(Zx|FPCZkyP!jP%D3b^f3^kJ?WQkE- zE((XMi&Ir6@>=w%OBg!|#mCZUOzA1jG{qRulc^Xc>n9MFl;LeGJT(mwqLSV32f>6_ z_Q;)l-Fd2)aTM4l0Q6ORkIF_&5W|DYS0XQbK$7H#;tuLnu zjrT*&~A{P86{rp*$s04BJhq~UJu7bezl8XPTB z*Q2oGDicff;78Cqg3YjfZ-75uJ{e+%Opf-zv7b8FtPP=9>}!ZQ&mJdRY`s_g{%+Og zItidjGJ6TeNK|aP%y*C&19m&;d7d;GTW`l6d!{;=fnl#DJWgDmGE0G;Y+^@MDhAAO z^^FPd$WhG+z&kAmg9{O2pb+}o(~u4?Hgw@9=I6$e<~l`;BvmHwcSZX1FkvCod&vj> z^B6g)AZ5)$G`S0)>)g;oNWVIMrj$pELs3{P4GDgVA0=%zxX`AWw0>OTpLz*{SYA^8 zM}E2mn~=-%i#i*QWC{TcBj3_{KN*K2o<>czv>VsYW5l!+G))UJdnSpkXx%8=XvDK$ zcF$|>S)fE(mLyUORHj;Ny;`++G{`Y)MygSXXs+I2(<^H*DoA_qEfh}kkY2B%R9V5o 
zS&Y|0;(KZ0(j-mGk4j{izI|!4RaLM)Q5I0mvjK`xKTch_;O%b;OJcHH6L-8lI!K_H#RC`-44ZfUMyNpB;_hNoZe^^RE=aa^c3`XLah%ux#A34?7!7;CW%vc~uv$o*k& zyD8NN;G94{Cr-+CL;|Kz{9bFOwV3z=@fWV4jWKDcku#J4=-mm|5OaA#!9w}HloWMe z-?5P+CZZjiEbO(0?5Ob2#$t$B6zDDdu@BTYg=24rb9CjyDoHkE%@9&nIpmLYVZ5!L z>$}JvBXn6+D_C&w4b%nFkuw%FZGn=AttUCUe(epmc%{yt>lZVtwCtppBPibF0Ny~i z^G$R((zI1sbqHvx#$}6&^}j_URbnRiB7Y?J2fArI#+LA1)y9m{cN!3A-_T;*ZkpGL>iO&5Frw&nH;ga)B7yp|{Msi^ZdkWiq@uW;`y?l%fH*5rVI&ZA z@*^TgwCJ9YaF9W()0j_9jCmTaYwwM?3aV+jP}5lH-tyU2J$WOkoV}KDh8f;A3{hPu z2`7o1RD5bOTqqQ88u^o}cV&&$({{B7y{Uv3UDJg*p3RiCb*Trll85PWKTObHA^5TO zWCKVG@5u;T#>|JiQ^TZjkeq1!138 zL^}ES37>{D=F3Ih1X7tHiP`l^?EW@;09rWo$(z*_K^zT3*YgFg0wyIxl!9@3TmRUv zJm00jbp_~$ejUlyM*Vd8j~7a5U;L~uidSp1ABpL3GRn|$0FZvDW%SeQdQdR$tx2bk8nD`Tb^_`Mn6Zh=Bb0J+@@0a71IA452!B7}!H#NHZ_RD8fHiU&r4 zTku%yxZnsuieH6gfhYU=nik|J{`yA5K+S-uO?r1OyE-;o4evSHh&`XPmKGgeV}7pP z_{SE(Y7P#L7$D#H1cg)GEm;Z1ua9g!?!PYajCz6l#n-?kE2Jn6o;(KQ{xy5S(y6Hy zL&rO@#k`yjrm+29Sw7Lyj*NFpQ#yOltG|?Vt&zrV4hrS(0y;% zu<`d@b3KBqpd{LDRPK`WH^%c`CgxY^Q1WDN!o~FDV!&^=syp02J!X7Xl16^2b}n9m z=)MY>-~4Eyz?*vwICiN2I46P|qV!LKJ(i(wo^O3X@btS~U+wZqEPs8aQGJ`t_{~Qw z36Xs?Y{pH&NPXl`&oVlw3cJR_#rE_Nac6h=nT(}hzd}cxH0ROLFkc>tx-E3ROeODGs z=wWY*a6igPhJ%aE7v-w{a@Y%I{8ngylv@XQwiG+--$}^qtP!5t%bym1nMJcajXz>b z!7Q#gAYuM0nF=ayAlQjta#xao1!=PIN1-=@K2tMe~9-VDL} zVmKHdF0{NYzp>|Ln>UGTV5|2kPM<1dAAF}tjJ9-}9-Bb0^eq$iTAN51*(6as9yWa} zA*4ocDvIeNQ%HIg>8U2E1A+3$<+JTVjvW9x#dWoB%&Bb;Z%z^w{LZZI$?XtdBxK}W zcsbJA@o6hC=RBkA@FQe{e$-y6AFw*eE6l&tw4|f5SpSNFl2$^KWCE{pL+p4lFAxt0 zr%xFM%TRQ0&21XbvRqR7u3|Vu;ZT&q@3Y{)THVO|_T}f_eQ9fLVuHU8%BQ)(3VRKJ z^jO_3(_^Dk}jmieMt1yw?W{&;8VlbamqL8&J}dcxox*@BRG`30aA?`DNan^YtZ8ZRcxVqN;x$s z!bGJ^E1?=1TyFS2NEay1xi#Agr&V<@c=^Aai=-6d2guSU9Ly8#(V;5*UjCkqlQ; zLRv&+Da%>8Y*+8G%*#&XmTB^J$hpX)cx<}oforW+he|9pgly4{)3e%C2cy%!8%pvV8F zU{Y*&A6?3p=Cz%qZ-)TfUXtg&7Q`+d`va!aD;Dh;x(Ly4PT&VW_C88}37n*74z` zFW+FWb=4>`f4h=iJ{OTSK!qm9!C-c+ebR6>3Pu*+o;(?JdJwP^8X3DDdBjSVTj!7v zET@8017&k11Gjv#XrF`9RZEW2n5;{!411YjKMwW7eq+64E;CuQdNSmF={oYdi3G{; 
zhM7m!UP1CVl6=L`(m)PYh%3*>Tslt3OCxeEX(N06Ru~ftlr~uu-~-MCU#k`wiQGwn z)qr2C4ZQvrBLc^1RD+NGWzp+;lVyGg8DuxxuM3hWKvkL#@)ec2v)AB3VA~-`ZBH~? z?8EC<>@-dEY;2hlRjmUwibH<{c|M}Pp_~(?QG-F?DH3APFig-(%r*kfFH9U}&VBW~ zk7@lAGwvlnzJnHI00Dj_cwPK1@}t#x-z_O-I5eS-Z$ON}7bNmq9p{CWdPwOeR54M& za=k0$f;2I4S_`2Pu(ObHM|eYai+o5WQ1AR!UnN+liyvQUwU}*BqZJ-|OAnb|1l0~| zj3i?N>K)#J$jUW#LK6jZy*ll5VwHP?+wX^MfWqBL-zf&@gzcO~qSYELD&*Q#41e+)>lv%)VM) zt#J*-O9)xc(@cGLB{f6j))rCna;tpTUbUnR2ZQiM=kCYS)R7M~(tyOsU%cZtnF42O zsi~QJ?q2ptfJ+0#^brg0ejJ~U7guY<2PI|MUv3Zcl;HDk#tvw^czO|#{sgpnQwOcd zvNqoJv*^Wf9SuHctgVMU(=eizUSbtdY-r`6C2`FYi-0IC)HQlXH=?A zXLw_l@@4@k++`Yh?c~AZ*wVu{kM+jgjKm;SC#N22z+jR~s%^EpL#vBx9A+RQkYnR} zo(%WjY>awxB#k@JwLcuK7=##Zt>O!B6%67p%X`N$SlD{*2z;uYQ>gy_1&r3M>3ZR7 z=k*;c>VulTeue1JOD}k5iAse-%r)6XAdFQPZduip<)p!xkzZp4iD%63FfPsU@ z3VXRaKqon*%YBt1oNDlK=@xkO_qQ{I=WGHTeq_#3D6-(a1L5&GLvew{leP&r5}}t) zo2|KryV35aqsU;7ObKBtQ*tbe@ouHzNcIpetPcXT$fM;^8FSwUrxESTmep@5E67Y& zP%a`JFL;cqb=Bwo8y7a|I!o%{U}YG^ipcsZfOq=U?w#SS8n4PV+^H%;h0gHl%0%L# ztJ%c_P@o>S8vNQOUFV@p^YnRL$!#YgTrO*6uJb5c)0MFoEsiIgDvJzvoV4F|?G`s0 zW^%fQj5_JnXAF>_j8Si&Nhh(j+j3dfAGa{mg9bh=xewQMw_ambFQe6=PF-A=2VYnV z0aukbNJV!~exEBwhHpOohrX`eItbmZS^l|=RrBLsTQ`pWEM;bw%_tbwdqXu}k)aye zpeNHHEeYa-ewJBG9M{<%dJUONMlWMug>us)v-*|k>I~02jH)<|0P1+Ymkn0qakotOnGvo;ZEfIhQ`BM&J zBb=K;k6_+5WeeJ)U`(A_j80&V(xJwnVBBzq9GyK82tT~DsK@@TRwa=yqX^4I0FjE! z5C4Q&u`|5RNKH2J6r=X{?`A_xe-X=#gk@!44T)mgwbD@+o~}4v*_w4I4$a1r3XU1}A-N3Ru^J8uzBM znsM^NBi4Vb99a^_D>_qMCKu7LauuX+1*}G9nuOc=qsZeaPE0f9Dq5mXI>n`JMns!P z-G1CMELsx8eX$k&Sj@!zL1P^eESvzhxi>t)LnIk)K%y+$f@UpKOwqCI^^9$v!yEip zBM_lPwE0KYu>p+g3ODMuGr+-jDa>3u{b{+iJI? 
z*;QpCek89n$t}b>92iFLPmR8yr6g{QaMu#%de@h+p*+NG;BLdM3^=?ZooKhIm6M9S zw(x{8=Y`2<2TcPM78wF{!JuQ@q6V=w0F*R@3$WR`cPF~gzL2N-w*CDr<*i?=&*eQ~ zg^nI}Ew=|5DtPb{t&2HLh7m{1XFiXhzelwhw>cBVFY2Kam0Z3q*tNrf7-4d`U#L_`0>w?sq6jX5*y<;Tw`_T0)$R(2bR!K-^U zUo2(?t%O%O;i8} z$}!$qtEX6$>Zf~hoj0b6XbUtSC!O;q)#c4GrC@;e(lHzfQV+~`WurusLc)^du_C24 zs*)p7Jzi<1&M&kz2Ra;LJue>ZYzXA1-Wn_!halg^2zPLQeD0sjhwgRp1@jYetAtAx ze$PTUTIS?>&?|M%u;#D-P5${EAUhq1zRRELB|s|D(eVAZ2FHXn)h49qraWcJMW$ja9RJN6t+PC&;97= zpARAbU(s02ot-=^ojlzCi^h7HONovI_qiHS)#e*!W6e?anIT1@I6uvGoW z^ogjc{YRAjM-=-H{STA)2W6DK%$(F6KL39I5oIMm3o+XNlmD3(7gzs}YW@$3|A+nu z Date: Thu, 14 Nov 2024 08:23:12 -0500 Subject: [PATCH 09/24] generate cicd workflow for new transform Signed-off-by: Maroun Touma --- .../workflows/test-universal-web2parquet.yml | 133 ++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 .github/workflows/test-universal-web2parquet.yml diff --git a/.github/workflows/test-universal-web2parquet.yml b/.github/workflows/test-universal-web2parquet.yml new file mode 100644 index 000000000..b46d32050 --- /dev/null +++ b/.github/workflows/test-universal-web2parquet.yml @@ -0,0 +1,133 @@ +# +# DO NOT EDIT THIS FILE: it is generated from test-transform.template, Edit there and run make to change these files +# +name: Test - transforms/universal/web2parquet + +on: + workflow_dispatch: + push: + branches: + - "dev" + - "releases/**" + tags: + - "*" + paths: + - ".make.*" + - "transforms/.make.transforms" + - "transforms/universal/web2parquet/**" + - "data-processing-lib/**" + - "!transforms/universal/web2parquet/**/kfp_ray/**" # This is/will be tested in separate workflow + - "!data-processing-lib/**/test/**" + - "!data-processing-lib/**/test-data/**" + - "!**.md" + - "!**/doc/**" + - "!**/images/**" + - "!**.gitignore" + pull_request: + branches: + - "dev" + - "releases/**" + paths: + - ".make.*" + - "transforms/.make.transforms" + - "transforms/universal/web2parquet/**" + - "data-processing-lib/**" + - 
"!transforms/universal/web2parquet/**/kfp_ray/**" # This is/will be tested in separate workflow + - "!data-processing-lib/**/test/**" + - "!data-processing-lib/**/test-data/**" + - "!**.md" + - "!**/doc/**" + - "!**/images/**" + - "!**.gitignore" + +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + check_if_push_image: + # check whether the Docker images should be pushed to the remote repository + # The images are pushed if it is a merge to dev branch or a new tag is created. + # The latter being part of the release process. + # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. + runs-on: ubuntu-22.04 + outputs: + publish_images: ${{ steps.version.outputs.publish_images }} + steps: + - id: version + run: | + publish_images='false' + if [[ ${GITHUB_REF} == refs/heads/dev && ${GITHUB_EVENT_NAME} != 'pull_request' && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ; + then + publish_images='true' + fi + if [[ ${GITHUB_REF} == refs/tags/* && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ; + then + publish_images='true' + fi + echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" + test-src: + runs-on: ubuntu-22.04 + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Free up space in github runner + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup + sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true + df -h + - name: Test transform source in transforms/universal/web2parquet + run: | + if [ 
-e "transforms/universal/web2parquet/Makefile" ]; then + make -C transforms/universal/web2parquet DOCKER=docker test-src + else + echo "transforms/universal/web2parquet/Makefile not found - source testing disabled for this transform." + fi + test-image: + needs: [check_if_push_image] + runs-on: ubuntu-22.04 + timeout-minutes: 120 + env: + DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} + DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }} + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Free up space in github runner + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup + sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true + df -h + - name: Test transform image in transforms/universal/web2parquet + run: | + if [ -e "transforms/universal/web2parquet/Makefile" ]; then + if [ -d "transforms/universal/web2parquet/spark" ]; then + make -C data-processing-lib/spark DOCKER=docker image + fi + make -C transforms/universal/web2parquet DOCKER=docker test-image + else + echo "transforms/universal/web2parquet/Makefile not found - testing disabled for this transform." + fi + - name: Print space + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + docker images + - name: Publish images + if: needs.check_if_push_image.outputs.publish_images == 'true' + run: | + if [ -e "transforms/universal/web2parquet/Makefile" ]; then + make -C transforms/universal/web2parquet publish + else + echo "transforms/universal/web2parquet/Makefile not found - publishing disabled for this transform." 
+ fi From fcbcc0a231ceab29eba40bdcaca0659daebc583d Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Thu, 14 Nov 2024 11:33:12 -0500 Subject: [PATCH 10/24] build image only if a Dockerfile is defined Signed-off-by: Maroun Touma --- transforms/.make.modules | 49 ++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/transforms/.make.modules b/transforms/.make.modules index 31e9121b0..36026cc5a 100644 --- a/transforms/.make.modules +++ b/transforms/.make.modules @@ -25,27 +25,54 @@ test:: .transforms.test-src test-image clean:: .transforms.clean -#image:: .transforms.ray-image +## We need to think how we want to do this going forward +set-versions:: + +## We need to think how we want to do this going forward +build:: + +image:: + @if [ -e Dockerfile ]; then \ + $(MAKE) image-default ; \ + else \ + echo "Skipping image for $(shell pwd) since no Dockerfile is present"; \ + fi + +publish:: + @if [ -e Dockerfile ]; then \ + $(MAKE) publish-default ; \ + else \ + echo "Skipping publish for $(shell pwd) since no Dockerfile is present"; \ + fi + +publish-image:: + @if [ -e Dockerfile ]; then \ + $(MAKE) publish-image-default ; \ + else \ + echo "Skipping publish-image for $(shell pwd) since no Dockerfile is present"; \ + fi + +test-image:: + @if [ -e Dockerfile ]; then \ + $(MAKE) test-image-default ; \ + else \ + echo "Skipping test-image for $(shell pwd) since no Dockerfile is present"; \ + fi test-src:: .transforms.test-src setup:: .transforms.setup -publish:: publish-image +publish-default:: publish-image -publish-image:: .transforms.publish-image-ray +publish-image-default:: .transforms.publish-image-ray -test-image:: image .transforms.test-image-help .defaults.test-image-pytest .transforms.clean - -set-versions:: - -## We need to think how we want to do this going forward -build:: +test-image-default:: image .transforms.test-image-help .defaults.test-image-pytest .transforms.clean build-lib-wheel: make -C 
$(REPOROOT)/data-processing-lib build-pkg-dist -image:: build-lib-wheel +image-default:: build-lib-wheel @$(eval LIB_WHEEL_FILE := $(shell find $(REPOROOT)/data-processing-lib/dist/*.whl)) rm -fr dist && mv $(REPOROOT)/data-processing-lib/dist . $(eval WHEEL_FILE_NAME := $(shell basename $(LIB_WHEEL_FILE))) @@ -60,6 +87,4 @@ image:: build-lib-wheel $(DOCKER) tag $(DOCKER_LOCAL_IMAGE) $(DOCKER_REMOTE_IMAGE) rm -fr dist -publish:: publish-image - From b5031c98139741e9a576d538d2c34bccd1d95858 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Thu, 14 Nov 2024 15:19:17 -0500 Subject: [PATCH 11/24] Ignore page content as long as we get the right count Signed-off-by: Maroun Touma --- transforms/universal/web2parquet/test/test_web2parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/universal/web2parquet/test/test_web2parquet.py b/transforms/universal/web2parquet/test/test_web2parquet.py index 7d2e42e9b..da99d168d 100644 --- a/transforms/universal/web2parquet/test/test_web2parquet.py +++ b/transforms/universal/web2parquet/test/test_web2parquet.py @@ -41,7 +41,7 @@ def get_test_transform_fixtures(self) -> list[tuple]: transform_config, input_dir, expected_dir, - [], # optional list of column names to ignore in comparing test-generated with expected. + ['contents'], # optional list of column names to ignore in comparing test-generated with expected. 
) ) From 9ad3d18727a3113f12fecb8e7be774a07bcc3dee Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Thu, 14 Nov 2024 19:45:21 -0500 Subject: [PATCH 12/24] rename make.cicd.target Signed-off-by: Maroun Touma --- .../{.make.modules => .make.cicd.targets} | 23 +++++++++---------- transforms/pyproject.toml | 4 ++++ transforms/universal/web2parquet/Makefile | 2 +- 3 files changed, 16 insertions(+), 13 deletions(-) rename transforms/{.make.modules => .make.cicd.targets} (79%) diff --git a/transforms/.make.modules b/transforms/.make.cicd.targets similarity index 79% rename from transforms/.make.modules rename to transforms/.make.cicd.targets index 36026cc5a..69a5f54fd 100644 --- a/transforms/.make.modules +++ b/transforms/.make.cicd.targets @@ -1,13 +1,6 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
@@ -15,11 +8,17 @@ include $(REPOROOT)/transforms/.make.transforms ###################################################################### ## Default setting for TRANSFORM_RUNTIME uses folder name-- Old layout -TRANSFORM_RUNTIME=ray -TRANSFORM_RUNTIME_SRC_FILE=-m dpk_$(TRANSFORM_NAME).$(TRANSFORM_RUNTIME).transform +TRANSFORM_PYTHON_RUNTIME_SRC_FILE=-m dpk_$(TRANSFORM_NAME).transform +TRANSFORM_RAY_RUNTIME_SRC_FILE=-m dpk_$(TRANSFORM_NAME).ray.transform +TRANSFORM_PYTHON_RUNTIME_SRC_FILE=-m dpk_$(TRANSFORM_NAME).spark.transform + +venv:: .defaults.create-venv + source venv/bin/activate && $(PIP) install -e $(REPOROOT)/data-processing-lib[ray,spark] + source venv/bin/activate && $(PIP) install -e $(REPOROOT)/data-connector-lib + if [ -e requirements.txt ]; then \ + source venv/bin/activate && $(PIP) install -r requirements.txt; \ + fi; -venv:: .transforms.ray-venv - source venv/bin/activate && $(PYTHON) -m pip install $(REPOROOT)/data-connector-lib test:: .transforms.test-src test-image @@ -65,7 +64,7 @@ setup:: .transforms.setup publish-default:: publish-image -publish-image-default:: .transforms.publish-image-ray +publish-image-default:: .defaults.publish-image test-image-default:: image .transforms.test-image-help .defaults.test-image-pytest .transforms.clean diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml index 6e6cc2955..ac05c5884 100644 --- a/transforms/pyproject.toml +++ b/transforms/pyproject.toml @@ -78,6 +78,10 @@ resize = { file = ["universal/resize/python/requirements.txt"]} # Does not seem to work for our custom layout # copy all files to a single src and let automatic discovery find them + +#[tool.setuptools.package-dir] +#dpk_web2parquet = "universal/web2parquet/dpk_web2parquet" + [options] package_dir = ["src","test"] diff --git a/transforms/universal/web2parquet/Makefile b/transforms/universal/web2parquet/Makefile index 8978c5d8b..e56e8b816 100644 --- a/transforms/universal/web2parquet/Makefile +++ 
b/transforms/universal/web2parquet/Makefile @@ -1,6 +1,6 @@ REPOROOT=../../.. # Use make help, to see the available rules -include $(REPOROOT)/transforms/.make.modules +include $(REPOROOT)/transforms/.make.cicd.targets # # This is intended to be included across the Makefiles provided within From c9c9779d2600d34de6afd75db12f3aa3ab8bfa02 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 15 Nov 2024 11:31:46 -0500 Subject: [PATCH 13/24] updated notebook with example Signed-off-by: Maroun Touma --- .../universal/web2parquet/web2parquet.ipynb | 151 ++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 transforms/universal/web2parquet/web2parquet.ipynb diff --git a/transforms/universal/web2parquet/web2parquet.ipynb b/transforms/universal/web2parquet/web2parquet.ipynb new file mode 100644 index 000000000..0e58e315e --- /dev/null +++ b/transforms/universal/web2parquet/web2parquet.ipynb @@ -0,0 +1,151 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip install need to be adapted to use the appropriate release level. 
Alternatively, The venv running the jupyter lab could be pre-configured with the right release\n", + "\n", + "##### **** example: \n", + "```\n", + "python -m venv && source venv/bin/activate\n", + "pip install -r requirements.txt\n", + "pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install data-prep-toolkit\n", + "!pip install data-prep-connector" + ] + }, + { + "cell_type": "markdown", + "id": "614f0633-ad65-4994-9d61-0c21986ca3eb", + "metadata": {}, + "source": [ + "##### **** Note: must enable nested asynchronous io in a notebook as the crawler uses coroutine to speed up acquisition and downloads\n", + "#####\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b6c89ac7-6824-4d99-8120-7d5b150bd683", + "metadata": {}, + "outputs": [], + "source": [ + "import nest_asyncio\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "markdown", + "id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3", + "metadata": {}, + "source": [ + "##### **** Configure the crawler parameter and invoke the transform function\n", + "##### \n", + "| parameter:type | Description |\n", + "| --- | --- |\n", + "| urls: list | list of seeds URL (i.e. ['https://thealliance.ai'] or ['www.ibm.com/docs','www.ibm.com/help']. 
The list can include any number of valid urls |\n", + "|depth: int | control crawling depth |\n", + "| downloads: int | number of downloads that are stored to the download folder |\n", + "| folder: str | folder where downloaded files are stored |" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "from dpk_web2parquet.transform import Web2Parquet\n", + "Web2Parquet(urls= ['https://thealliance.ai/'],\n", + " depth=2, \n", + " downloads=10,\n", + " folder='downloads').transform()\n" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the downloaded files. The file name is the full URL where the / is replaced with an _ and the file extension is based on returned content-type." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['downloads/thealliance_ai_core-projects-ntia_request_text.html',\n", + " 'downloads/thealliance_ai_focus-areas-advocacy_text.html',\n", + " 'downloads/thealliance_ai_blog-open-source-ai-demo-night-sf-2024_text.html',\n", + " 'downloads/thealliance_ai_contact_text.html',\n", + " 'downloads/thealliance_ai_core-projects-sb1047_text.html',\n", + " 'downloads/thealliance_ai_focus-areas-foundation-models-datasets_text.html',\n", + " 'downloads/thealliance_ai_focus-areas-hardware-enablement_text.html',\n", + " 'downloads/thealliance_ai_core-projects-trusted-evals_text.html',\n", + " 'downloads/thealliance_ai__text.html',\n", + " 'downloads/thealliance_ai_contribute_text.html',\n", + " 'downloads/thealliance_ai_community_text.html',\n", + " 'downloads/thealliance_ai_become-a-collaborator_text.html']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + 
], + "source": [ + "import glob\n", + "glob.glob(\"downloads/*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fef6667e-71ed-4054-9382-55c6bb3fda70", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From b77bbe94b51b8b63466fd515c8b92152c3f26695 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 15 Nov 2024 11:34:26 -0500 Subject: [PATCH 14/24] updated notebook with example Signed-off-by: Maroun Touma --- .../universal/web2parquet/web2parquet.ipynb | 35 ++++--------------- 1 file changed, 7 insertions(+), 28 deletions(-) diff --git a/transforms/universal/web2parquet/web2parquet.ipynb b/transforms/universal/web2parquet/web2parquet.ipynb index 0e58e315e..f31cf926f 100644 --- a/transforms/universal/web2parquet/web2parquet.ipynb +++ b/transforms/universal/web2parquet/web2parquet.ipynb @@ -5,7 +5,7 @@ "id": "afd55886-5f5b-4794-838e-ef8179fb0394", "metadata": {}, "source": [ - "##### **** These pip install need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with the right release\n", + "##### **** These pip install need to be adapted to use the appropriate release level. 
Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release\n", "\n", "##### **** example: \n", "```\n", @@ -17,13 +17,14 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", "metadata": {}, "outputs": [], "source": [ "%%capture\n", "!pip install data-prep-toolkit\n", + "!pip install data-prep-toolkit-transforms\n", "!pip install data-prep-connector" ] }, @@ -38,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "b6c89ac7-6824-4d99-8120-7d5b150bd683", "metadata": {}, "outputs": [], @@ -64,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", "metadata": {}, "outputs": [], @@ -87,32 +88,10 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "7276fe84-6512-4605-ab65-747351e13a7c", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['downloads/thealliance_ai_core-projects-ntia_request_text.html',\n", - " 'downloads/thealliance_ai_focus-areas-advocacy_text.html',\n", - " 'downloads/thealliance_ai_blog-open-source-ai-demo-night-sf-2024_text.html',\n", - " 'downloads/thealliance_ai_contact_text.html',\n", - " 'downloads/thealliance_ai_core-projects-sb1047_text.html',\n", - " 'downloads/thealliance_ai_focus-areas-foundation-models-datasets_text.html',\n", - " 'downloads/thealliance_ai_focus-areas-hardware-enablement_text.html',\n", - " 'downloads/thealliance_ai_core-projects-trusted-evals_text.html',\n", - " 'downloads/thealliance_ai__text.html',\n", - " 'downloads/thealliance_ai_contribute_text.html',\n", - " 'downloads/thealliance_ai_community_text.html',\n", - " 'downloads/thealliance_ai_become-a-collaborator_text.html']" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import glob\n", 
"glob.glob(\"downloads/*\")" From 8e71177be2607e77f98b535097065023b18ede28 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 15 Nov 2024 12:39:12 -0500 Subject: [PATCH 15/24] added readme.md Signed-off-by: Maroun Touma --- transforms/universal/web2parquet/README.md | 37 +++++++++++++++++++ .../universal/web2parquet/web2parquet.ipynb | 32 +++++++++++++--- 2 files changed, 64 insertions(+), 5 deletions(-) create mode 100644 transforms/universal/web2parquet/README.md diff --git a/transforms/universal/web2parquet/README.md b/transforms/universal/web2parquet/README.md new file mode 100644 index 000000000..0716914b3 --- /dev/null +++ b/transforms/universal/web2parquet/README.md @@ -0,0 +1,37 @@ +# Web Crawler to Parquet + +This tranforms crawls the web and download files in real-time. + +This first release of the transform, only accept the following 4 parameters. Additional releases will extend the functionality to allow the user to specify additional constraints such as mime-type, domain-focus, etc. + + +## parameters + +For configuring the crawl, users need to identify the follow paramters: + +| parameter:type | Description | +| --- | --- | +| urls:list | list of seeds URL (i.e. ['https://thealliance.ai'] or ['https://www.apache.org/projects','https://www.apache.org/foundation']). The list can include any number of valid urls that are not configured to block web crawlers | +|depth:int | control crawling depth | +| downloads:int | number of downloads that are stored to the download folder. Since the crawler operations happen asyncrhonous, the process can result in any 10 of the visited URLs being retrieved (i.e. consecutive runs can result in different files being downloaded) | +| folder:str | folder where downloaded files are stored. 
If the folder is not empty, new files are added or replace existing ones with the same URLs | + + +## Invoking the transform from a notebook + +In order to invoke the transfrom from the notebook, users must enable nested asynchronous io as follow: +import nest_asyncio +nest_asyncio.apply() + +In order to invoke the transform users need to import the transform class and call the transform() function: + +example: +``` +import nest_asyncio +nest_asyncio.apply() +from dpk_web2parquet.transform import Web2Parquet +Web2Parquet(urls= ['https://thealliance.ai/'], + depth=2, + downloads=10, + folder='downloads').transform() +```` \ No newline at end of file diff --git a/transforms/universal/web2parquet/web2parquet.ipynb b/transforms/universal/web2parquet/web2parquet.ipynb index f31cf926f..2bd55f0bc 100644 --- a/transforms/universal/web2parquet/web2parquet.ipynb +++ b/transforms/universal/web2parquet/web2parquet.ipynb @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", "metadata": {}, "outputs": [], @@ -39,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "b6c89ac7-6824-4d99-8120-7d5b150bd683", "metadata": {}, "outputs": [], @@ -65,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", "metadata": {}, "outputs": [], @@ -88,10 +88,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "7276fe84-6512-4605-ab65-747351e13a7c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['downloads/thealliance_ai_core-projects-ntia_request_text.html',\n", + " 'downloads/thealliance_ai_focus-areas-advocacy_text.html',\n", + " 'downloads/thealliance_ai_blog-open-source-ai-demo-night-sf-2024_text.html',\n", + " 'downloads/thealliance_ai_contact_text.html',\n", + " 
'downloads/thealliance_ai_core-projects-sb1047_text.html',\n", + " 'downloads/thealliance_ai_focus-areas-foundation-models-datasets_text.html',\n", + " 'downloads/thealliance_ai_focus-areas-hardware-enablement_text.html',\n", + " 'downloads/thealliance_ai_core-projects-trusted-evals_text.html',\n", + " 'downloads/thealliance_ai__text.html',\n", + " 'downloads/thealliance_ai_contribute_text.html',\n", + " 'downloads/thealliance_ai_community_text.html',\n", + " 'downloads/thealliance_ai_become-a-collaborator_text.html']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import glob\n", "glob.glob(\"downloads/*\")" From ef7c57ddddb54d9c5d2aa37c2bbe63b3866564bf Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 15 Nov 2024 16:10:35 -0500 Subject: [PATCH 16/24] fix typos Signed-off-by: Maroun Touma --- transforms/Makefile | 1 + transforms/README-list.md | 23 +++++++++++----------- transforms/pyproject.toml | 5 +++++ transforms/universal/web2parquet/README.md | 8 ++++---- 4 files changed, 22 insertions(+), 15 deletions(-) diff --git a/transforms/Makefile b/transforms/Makefile index fc2dc6c6f..c04c72286 100644 --- a/transforms/Makefile +++ b/transforms/Makefile @@ -114,6 +114,7 @@ build-pkg-dist: fi \ done # Only needs to build the whl + git show --no-patch > src/data/gitshow.txt $(MAKE) BUILD_WHEEL_EXTRA_ARG=-w .defaults.build-dist -rm -fr src diff --git a/transforms/README-list.md b/transforms/README-list.md index 99885ad34..79b3ec50e 100644 --- a/transforms/README-list.md +++ b/transforms/README-list.md @@ -23,18 +23,19 @@ Note: This list includes the transforms that were part of the release starting w * [code_quality](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code_quality/python/README.md) * [proglang_select](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/proglang_select/python/README.md) * language - * 
[doc_chunk](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/doc_chunk/python/README.md) - * [doc_quality](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/doc_quality/python/README.md) - * [lang_id](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/lang_id/python/README.md) - * [pdf2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/pdf2parquet/python/README.md) - * [text_encoder](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/text_encoder/python/README.md) - * [pii_redactor](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/pii_redactor/python/README.md) + * [doc_chunk](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/doc_chunk/python/README.md) + * [doc_quality](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/doc_quality/python/README.md) + * [lang_id](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/lang_id/python/README.md) + * [pdf2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/pdf2parquet/python/README.md) + * [text_encoder](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/text_encoder/python/README.md) + * [pii_redactor](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/pii_redactor/python/README.md) * universal - * [ededup](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/ededup/python/README.md) - * [filter](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/filter/python/README.md) - * [resize](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/resize/python/README.md) - * [tokenization](https://github.com/IBM/data-prep-kit/blob/dev/transforms/tokenization/doc_chunk/python/README.md) - * [doc_id](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/doc_id/python/README.md) + * [ededup](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/ededup/python/README.md) + * 
[filter](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/filter/python/README.md) + * [resize](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/resize/python/README.md) + * [tokenization](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/tokenization/python/README.md) + * [doc_id](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/doc_id/python/README.md) + * [web2prquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/web2parquet/README.md) diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml index ac05c5884..6066ef9ba 100644 --- a/transforms/pyproject.toml +++ b/transforms/pyproject.toml @@ -78,6 +78,11 @@ resize = { file = ["universal/resize/python/requirements.txt"]} # Does not seem to work for our custom layout # copy all files to a single src and let automatic discovery find them +[tool.setuptools.package-data] +"*" = ["*.txt"] + +[tool.setuptools.packages.find] +where = ["src"] #[tool.setuptools.package-dir] #dpk_web2parquet = "universal/web2parquet/dpk_web2parquet" diff --git a/transforms/universal/web2parquet/README.md b/transforms/universal/web2parquet/README.md index 0716914b3..6fc31ca5b 100644 --- a/transforms/universal/web2parquet/README.md +++ b/transforms/universal/web2parquet/README.md @@ -1,13 +1,13 @@ # Web Crawler to Parquet -This tranforms crawls the web and download files in real-time. +This tranform crawls the web and downloads files in real-time. -This first release of the transform, only accept the following 4 parameters. Additional releases will extend the functionality to allow the user to specify additional constraints such as mime-type, domain-focus, etc. +This first release of the transform, only accepts the following 4 parameters. Additional releases will extend the functionality to allow the user to specify additional constraints such as mime-type, domain-focus, etc. 
-## parameters +## Parameters -For configuring the crawl, users need to identify the follow paramters: +For configuring the crawl, users need to identify the follow parameters: | parameter:type | Description | | --- | --- | From 8c55ad8c4f17c87e71d230e9fe1a112e7e72bd64 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 15 Nov 2024 16:15:31 -0500 Subject: [PATCH 17/24] More typos Signed-off-by: Maroun Touma --- transforms/universal/web2parquet/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/transforms/universal/web2parquet/README.md b/transforms/universal/web2parquet/README.md index 6fc31ca5b..2bbcaa9cd 100644 --- a/transforms/universal/web2parquet/README.md +++ b/transforms/universal/web2parquet/README.md @@ -11,15 +11,15 @@ For configuring the crawl, users need to identify the follow parameters: | parameter:type | Description | | --- | --- | -| urls:list | list of seeds URL (i.e. ['https://thealliance.ai'] or ['https://www.apache.org/projects','https://www.apache.org/foundation']). The list can include any number of valid urls that are not configured to block web crawlers | +| urls:list | list of seeds URL (i.e., ['https://thealliance.ai'] or ['https://www.apache.org/projects','https://www.apache.org/foundation']). The list can include any number of valid urls that are not configured to block web crawlers | |depth:int | control crawling depth | -| downloads:int | number of downloads that are stored to the download folder. Since the crawler operations happen asyncrhonous, the process can result in any 10 of the visited URLs being retrieved (i.e. consecutive runs can result in different files being downloaded) | -| folder:str | folder where downloaded files are stored. If the folder is not empty, new files are added or replace existing ones with the same URLs | +| downloads:int | number of downloads that are stored to the download folder. 
Since the crawler operations happen asynchronously, the process can result in any 10 of the visited URLs being retrieved (i.e. consecutive runs can result in different files being downloaded) | +| folder:str | folder where downloaded files are stored. If the folder is not empty, new files are added or replace the existing ones with the same URLs | ## Invoking the transform from a notebook -In order to invoke the transfrom from the notebook, users must enable nested asynchronous io as follow: +In order to invoke the transfrom from a notebook, users must enable nested asynchronous io as follows: import nest_asyncio nest_asyncio.apply() From ba4b0a4988c6b91a83adc179dfdfbbf3a3ada8d6 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 15 Nov 2024 16:17:08 -0500 Subject: [PATCH 18/24] more typos Signed-off-by: Maroun Touma --- transforms/universal/web2parquet/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/universal/web2parquet/README.md b/transforms/universal/web2parquet/README.md index 2bbcaa9cd..36afd9251 100644 --- a/transforms/universal/web2parquet/README.md +++ b/transforms/universal/web2parquet/README.md @@ -11,7 +11,7 @@ For configuring the crawl, users need to identify the follow parameters: | parameter:type | Description | | --- | --- | -| urls:list | list of seeds URL (i.e., ['https://thealliance.ai'] or ['https://www.apache.org/projects','https://www.apache.org/foundation']). The list can include any number of valid urls that are not configured to block web crawlers | +| urls:list | list of seed URLs (i.e., ['https://thealliance.ai'] or ['https://www.apache.org/projects','https://www.apache.org/foundation']). The list can include any number of valid URLS that are not configured to block web crawlers | |depth:int | control crawling depth | | downloads:int | number of downloads that are stored to the download folder. 
Since the crawler operations happen asynchronously, the process can result in any 10 of the visited URLs being retrieved (i.e. consecutive runs can result in different files being downloaded) | | folder:str | folder where downloaded files are stored. If the folder is not empty, new files are added or replace the existing ones with the same URLs | From 6ea2e76936fb8fc6fa16c664864266cc88059799 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 15 Nov 2024 16:38:06 -0500 Subject: [PATCH 19/24] more typos Signed-off-by: Maroun Touma --- transforms/universal/web2parquet/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/universal/web2parquet/README.md b/transforms/universal/web2parquet/README.md index 36afd9251..ef093cef0 100644 --- a/transforms/universal/web2parquet/README.md +++ b/transforms/universal/web2parquet/README.md @@ -23,7 +23,7 @@ In order to invoke the transfrom from a notebook, users must enable nested async import nest_asyncio nest_asyncio.apply() -In order to invoke the transform users need to import the transform class and call the transform() function: +In order to invoke the transform, users need to import the transform class and call the transform() function: example: ``` From 670f381401fc913c207abf6995b88873e70a1cda Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 15 Nov 2024 16:50:25 -0500 Subject: [PATCH 20/24] reference nested asyncio project Signed-off-by: Maroun Touma --- transforms/universal/web2parquet/README.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/transforms/universal/web2parquet/README.md b/transforms/universal/web2parquet/README.md index ef093cef0..2b01073f5 100644 --- a/transforms/universal/web2parquet/README.md +++ b/transforms/universal/web2parquet/README.md @@ -19,13 +19,9 @@ For configuring the crawl, users need to identify the follow parameters: ## Invoking the transform from a notebook -In order to invoke the transfrom from a notebook, users must enable 
nested asynchronous io as follows: -import nest_asyncio -nest_asyncio.apply() +In order to invoke the transfrom from a notebook, users must enable nested asynchronous ( https://pypi.org/project/nest-asyncio/ ), import the transform class and call the `transform()`function as shown in the example below: -In order to invoke the transform, users need to import the transform class and call the transform() function: -example: ``` import nest_asyncio nest_asyncio.apply() From 46b168a3dfb04cb644d645721ff6c099e4b5c6e3 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 15 Nov 2024 16:59:07 -0500 Subject: [PATCH 21/24] fix typo Signed-off-by: Maroun Touma --- transforms/README-list.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/README-list.md b/transforms/README-list.md index 79b3ec50e..3e70b6b62 100644 --- a/transforms/README-list.md +++ b/transforms/README-list.md @@ -35,7 +35,7 @@ Note: This list includes the transforms that were part of the release starting w * [resize](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/resize/python/README.md) * [tokenization](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/tokenization/python/README.md) * [doc_id](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/doc_id/python/README.md) - * [web2prquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/web2parquet/README.md) + * [web2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/web2parquet/README.md) From 190969bb2430fde8a701187b5d683c829483a50c Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 15 Nov 2024 17:24:52 -0500 Subject: [PATCH 22/24] added instructions for installing the webcrawler module Signed-off-by: Maroun Touma --- transforms/universal/web2parquet/README.md | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/transforms/universal/web2parquet/README.md b/transforms/universal/web2parquet/README.md 
index 2b01073f5..e6fbc6822 100644 --- a/transforms/universal/web2parquet/README.md +++ b/transforms/universal/web2parquet/README.md @@ -7,7 +7,7 @@ This first release of the transform, only accepts the following 4 parameters. Ad ## Parameters -For configuring the crawl, users need to identify the follow parameters: +For configuring the crawl, users need to specify the follow parameters: | parameter:type | Description | | --- | --- | @@ -17,6 +17,25 @@ For configuring the crawl, users need to identify the follow parameters: | folder:str | folder where downloaded files are stored. If the folder is not empty, new files are added or replace the existing ones with the same URLs | +## Install the transform + +The transform can be installed directly from pypi and has a dependency on the data-prep-toolkit and the data-prep-connector + +``` +pip install data-prep-connector +pip install 'data-prep-toolkit>=0.2.2.dev2' +pip install 'data-prep-toolkit-transforms[web2parquet]>=0.2.2.dev3' +``` + +If working from a fork in the git repo, from the root folder of the git repo, do the following: + +``` +cd transforms/universal/web2parquet +make venv +source venv/bin/activate +pip install -r requirements.txt +``` + ## Invoking the transform from a notebook In order to invoke the transfrom from a notebook, users must enable nested asynchronous ( https://pypi.org/project/nest-asyncio/ ), import the transform class and call the `transform()`function as shown in the example below: From 96e46c7c87098520ebaedc6b4f44910ac1bad21f Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 15 Nov 2024 17:31:47 -0500 Subject: [PATCH 23/24] added the module to the transform package Signed-off-by: Maroun Touma --- transforms/Makefile | 6 ++++++ transforms/pyproject.toml | 3 ++- transforms/universal/web2parquet/README.md | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/transforms/Makefile b/transforms/Makefile index c04c72286..3e8b9cfde 100644 --- a/transforms/Makefile +++ 
b/transforms/Makefile @@ -132,3 +132,9 @@ test-pkg-dist: publish-dist :: .defaults.publish-dist +publish-testpypi: + ## when installing from testpypi, make sure you install the dependencies first (pip install data-prep-toolkit) + ## and then use --extra-index-url to install this package: + ## pip install --extra-index-url https://test.pypi.org/simple/ 'data-prep-toolkit-transforms[all]==x.x.x.devx' + twine upload --repository testpypi dist/* + diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml index 6066ef9ba..e05932933 100644 --- a/transforms/pyproject.toml +++ b/transforms/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_transforms" -version = "0.2.2.dev2" +version = "0.2.2.dev3" requires-python = ">=3.10,<3.13" keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Transforms using Ray" @@ -74,6 +74,7 @@ profiler = { file = ["universal/profiler/python/requirements.txt"]} doc_id = { file = ["universal/doc_id/python/requirements.txt"]} filter = { file = ["universal/filter/python/requirements.txt"]} resize = { file = ["universal/resize/python/requirements.txt"]} +web2parquet = { file = ["universal/web2parquet/requirements.txt"]} # Does not seem to work for our custom layout # copy all files to a single src and let automatic discovery find them diff --git a/transforms/universal/web2parquet/README.md b/transforms/universal/web2parquet/README.md index e6fbc6822..1841403a7 100644 --- a/transforms/universal/web2parquet/README.md +++ b/transforms/universal/web2parquet/README.md @@ -7,7 +7,7 @@ This first release of the transform, only accepts the following 4 parameters. 
Ad ## Parameters -For configuring the crawl, users need to specify the follow parameters: +For configuring the crawl, users need to specify the following parameters: | parameter:type | Description | | --- | --- | From 4a5997006252daad2564084cc887b0c0212e030e Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 15 Nov 2024 18:18:29 -0500 Subject: [PATCH 24/24] added requirements for web2parquet Signed-off-by: Maroun Touma --- transforms/pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml index e05932933..2357553e4 100644 --- a/transforms/pyproject.toml +++ b/transforms/pyproject.toml @@ -47,7 +47,8 @@ all = { file = [ "universal/profiler/python/requirements.txt", "universal/doc_id/python/requirements.txt", "universal/filter/python/requirements.txt", -"universal/resize/python/requirements.txt" +"universal/resize/python/requirements.txt", +"universal/web2parquet/requirements.txt" ]} # pyproject.toml must be in a parent and cannot be in sibling