Update SMS Spam URL (#346)

mhbuehler · web-flow · commit 54b1c2238f26 · 2023-06-09T13:39:17.000-07:00
Signed-off-by: Melanie Buehler <melanie.h.buehler@intel.com>
diff --git a/DATASETS.md b/DATASETS.md
@@ -8,7 +8,7 @@ This is a comprehensive list of public datasets used by this repository.
 | [AG News (TFDS)](https://www.tensorflow.org/datasets/catalog/ag_news_subset) | TensorFlow | Text Classification |
 | [Food101 (Torchvision)](https://pytorch.org/vision/stable/generated/torchvision.datasets.Food101.html#torchvision.datasets.Food101) | PyTorch | Image Classification |
 | [Food101 (TFDS)](https://www.tensorflow.org/datasets/catalog/food101) | TensorFlow | Image Classification |
-| [SMS Spam Collection](https://archive-beta.ics.uci.edu/dataset/228/sms+spam+collection) | PyTorch & TensorFlow | Text Classification |
+| [SMS Spam Collection](https://archive.ics.uci.edu/dataset/228/sms+spam+collection) | PyTorch & TensorFlow | Text Classification |
 | [TF Flowers (TFDS)](https://www.tensorflow.org/datasets/catalog/tf_flowers) |  PyTorch & TensorFlow | Image Classification |
 | [Cats vs. Dogs (TFDS)](https://www.tensorflow.org/datasets/catalog/cats_vs_dogs) |  TensorFlow | Image Classification |
 | [Country211 (Torchvision)](https://pytorch.org/vision/stable/generated/torchvision.datasets.Country211.html#torchvision.datasets.Country211) | PyTorch | Image Classification |
diff --git a/downloader/tests/test_dataset_download.py b/downloader/tests/test_dataset_download.py
@@ -28,7 +28,7 @@ class TestDatasetDownload:
     Tests the dataset downloader with a temp download directory that is initialized and cleaned up
     """
     URLS = {'sms_spam_collection':
-            'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip',
+            'https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip',
             'flowers':
             'https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz',
             'imagenet_labels':
@@ -80,10 +80,9 @@ def test_catalog_download(self, dataset_name, catalog, split, kwargs, size):
         # Check that the directory is not empty
         assert os.listdir(self._dataset_dir) is not None
 
-    # Removing the SMS Spam Collection test because it's failing (404 error)
-    # ['sms_spam_collection', URLS['sms_spam_collection'], 2],
     @pytest.mark.parametrize('dataset_name,url,num_contents',
-                             [['flowers', URLS['flowers'], 1],
+                             [['sms_spam_collection', URLS['sms_spam_collection'], 2],
+                              ['flowers', URLS['flowers'], 1],
                               ['imagenet_labels', URLS['imagenet_labels'], 1],
                               ['peacock', URLS['peacock'], 1],
                               ['pennfudan', URLS['pennfudan'], 1]])
diff --git a/examples/cli/text_classification.md b/examples/cli/text_classification.md
@@ -13,7 +13,7 @@ The `--dataset-dir` argument is the path to the directory where your dataset is
 argument to specify a list of the classes and the `--delimiter` to specify the character that
 separates the two columns. If no `--delimiter` is specified, the CLI will default to use a comma (`,`).
 
-This example is downloading the [SMS Spam Collection](https://archive-beta.ics.uci.edu/dataset/228/sms+spam+collection)
+This example is downloading the [SMS Spam Collection](https://archive.ics.uci.edu/dataset/228/sms+spam+collection)
 dataset, which has a tab separated value file in the .zip file. This dataset has labeled SMS text
 messages that are either being classified as `ham` or `spam`. The first column in the data file has
 the label (`ham` or `spam`) and the second column is the text of the SMS mesage. The string class
@@ -26,8 +26,8 @@ mkdir -p ${DATASET_DIR}
 mkdir -p ${OUTPUT_DIR}
 
 # Download and extract the dataset
-wget -P ${DATASET_DIR} https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
-unzip ${DATASET_DIR}/smsspamcollection.zip
+wget -P ${DATASET_DIR} https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip
+unzip ${DATASET_DIR}/sms+spam+collection.zip
 
 # Make a copy of the .csv file with 'numerical' in the file name
 DATASET_FILE=SMSSpamCollection_numerical.csv
diff --git a/notebooks/text_classification/pytorch_text_classification/PyTorch_Text_Classifier_fine_tuning.ipynb b/notebooks/text_classification/pytorch_text_classification/PyTorch_Text_Classifier_fine_tuning.ipynb
@@ -296,7 +296,7 @@
     "\n",
     "Instead of using a dataset from the Hugging Face dataset catalog, a custom dataset from your local system or a download can be used.\n",
     "\n",
-    "In this example, we download the [SMS Spam Collection dataset](https://archive-beta.ics.uci.edu/ml/datasets/sms+spam+collection). (Note: Please see this dataset's applicable license for terms and conditions. Intel Corporation does not own the rights to this data set and does not confer any rights to it.) The zip file has a single tab-separated value file with two columns. The first column is the label (`ham` or `spam`) and the second column is the text of the SMS message:\n",
+    "In this example, we download the [SMS Spam Collection dataset](https://archive.ics.uci.edu/ml/datasets/sms+spam+collection). (Note: Please see this dataset's applicable license for terms and conditions. Intel Corporation does not own the rights to this data set and does not confer any rights to it.) The zip file has a single tab-separated value file with two columns. The first column is the label (`ham` or `spam`) and the second column is the text of the SMS message:\n",
     "```\n",
     "<ham or spam>\t<text>\n",
     "<ham or spam>\t<text>\n",
@@ -379,7 +379,7 @@
     "\n",
     "# Modify the variables below to use a different dataset or a csv file on your local system.\n",
     "# The csv_path variable should be pointing to a csv file with 2 columns (the label and the text)\n",
-    "dataset_url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip\"\n",
+    "dataset_url = \"https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip\"\n",
     "dataset_dir = os.path.join(dataset_dir, \"smsspamcollection\")\n",
     "csv_name = \"SMSSpamCollection\"\n",
     "delimiter = \"\\t\"\n",
diff --git a/notebooks/text_classification/pytorch_text_classification/README.md b/notebooks/text_classification/pytorch_text_classification/README.md
@@ -3,7 +3,7 @@
 This notebook demonstrates fine tuning [pretrained models from Hugging Face](https://huggingface.co/models)
 using text classification datasets from the [Hugging Face Datasets catalog](https://huggingface.co/datasets) or
 a custom dataset. The [IMDb Larget Movie Review dataset](https://ai.stanford.edu/~amaas/data/sentiment/) is used
-from the Hugging Face Datasets catalog, and the [SMS Spam Collection dataset](https://archive-beta.ics.uci.edu/ml/datasets/sms+spam+collection)
+from the Hugging Face Datasets catalog, and the [SMS Spam Collection dataset](https://archive.ics.uci.edu/ml/datasets/sms+spam+collection)
 is used as an example of a custom dataset being loaded from a csv file.
 
 The notebook uses
diff --git a/notebooks/text_classification/tfhub_text_classification/BERT_Binary_Text_Classification.ipynb b/notebooks/text_classification/tfhub_text_classification/BERT_Binary_Text_Classification.ipynb
@@ -268,7 +268,7 @@
     "\n",
     "Instead of using a dataset from TensorFlow datasets, another dataset from your local system or a download can be used. \n",
     "\n",
-    "In this example, we download the [SMS Spam Collection dataset](https://archive-beta.ics.uci.edu/ml/datasets/sms+spam+collection). (Note: Please see this dataset's applicable license for terms and conditions. Intel Corporation does not own the rights to this data set and does not confer any rights to it.) The zip file has a single tab-separated value file with two columns. The first column is the label (`ham` or `spam`) and the second column is the text of the SMS message:\n",
+    "In this example, we download the [SMS Spam Collection dataset](https://archive.ics.uci.edu/ml/datasets/sms+spam+collection). (Note: Please see this dataset's applicable license for terms and conditions. Intel Corporation does not own the rights to this data set and does not confer any rights to it.) The zip file has a single tab-separated value file with two columns. The first column is the label (`ham` or `spam`) and the second column is the text of the SMS message:\n",
     "```\n",
     "<ham or spam>\t<text>\n",
     "<ham or spam>\t<text>\n",
@@ -357,7 +357,7 @@
     "\n",
     "# Modify the variables below to use a different dataset or a csv file on your local system.\n",
     "# The csv_path variable should be pointing to a csv file with 2 columns (the label and the text)\n",
-    "dataset_url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip\"\n",
+    "dataset_url = \"https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip\"\n",
     "dataset_directory = os.path.join(dataset_directory, \"smsspamcollection\")\n",
     "csv_name = \"SMSSpamCollection\"\n",
     "delimiter = \"\\t\"\n",
diff --git a/notebooks/text_classification/tlt_api_pyt_text_classification/TLT_PYT_Text_Classification.ipynb b/notebooks/text_classification/tlt_api_pyt_text_classification/TLT_PYT_Text_Classification.ipynb
@@ -120,7 +120,7 @@
     "\n",
     "If the .csv has more columns, the `select_cols` or `exclude_cols` parameters can be used to filter out which columns are parsed.\n",
     "\n",
-    "This example is downloading the [SMS Spam Collection](https://archive-beta.ics.uci.edu/ml/datasets/sms+spam+collection) dataset, which has a tab separated value file in the .zip file. This dataset has labeled SMS text messages that are either being classified as `ham` or `spam`. The first column in the data file has the label (`ham` or `spam`) and the second column is the text of the SMS mesage. (Note: Please see this dataset's applicable license for terms and conditions. Intel Corporation does not own the rights to this data set and does not confer any rights to it.)\n",
+    "This example is downloading the [SMS Spam Collection](https://archive.ics.uci.edu/ml/datasets/sms+spam+collection) dataset, which has a tab separated value file in the .zip file. This dataset has labeled SMS text messages that are either being classified as `ham` or `spam`. The first column in the data file has the label (`ham` or `spam`) and the second column is the text of the SMS message. (Note: Please see this dataset's applicable license for terms and conditions. Intel Corporation does not own the rights to this data set and does not confer any rights to it.)\n",
     "\n",
     "When using your own dataset, update the path to your dataset directory, as well the other variables with properties about the dataset like the csv file name, class names, delimiter, header, and the map function (if string labels need to be translated into numerical values)."
    ]
@@ -134,7 +134,7 @@
    "source": [
     "# Modify the variables below to use a different dataset or a csv file on your local system.\n",
     "# The csv_path variable should be pointing to a csv file with 2 columns (the label and the text)\n",
-    "dataset_url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip\"\n",
+    "dataset_url = \"https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip\"\n",
     "dataset_dir = os.path.join(dataset_dir, \"sms_spam_collection\")\n",
     "csv_name = \"SMSSpamCollection\"\n",
     "delimiter = \"\\t\"\n",
diff --git a/notebooks/text_classification/tlt_api_tf_text_classification/TLT_TF_Text_Classification.ipynb b/notebooks/text_classification/tlt_api_tf_text_classification/TLT_TF_Text_Classification.ipynb
@@ -120,7 +120,7 @@
     "\n",
     "If the .csv has more columns, the `select_cols` or `exclude_cols` parameters can be used to filter out which columns are parsed.\n",
     "\n",
-    "This example is downloading the [SMS Spam Collection](https://archive-beta.ics.uci.edu/ml/datasets/sms+spam+collection) dataset, which has a tab separated value file in the .zip file. This dataset has labeled SMS text messages that are either being classified as `ham` or `spam`. The first column in the data file has the label (`ham` or `spam`) and the second column is the text of the SMS mesage. (Note: Please see this dataset's applicable license for terms and conditions. Intel Corporation does not own the rights to this data set and does not confer any rights to it.)\n",
+    "This example is downloading the [SMS Spam Collection](https://archive.ics.uci.edu/ml/datasets/sms+spam+collection) dataset, which has a tab separated value file in the .zip file. This dataset has labeled SMS text messages that are either being classified as `ham` or `spam`. The first column in the data file has the label (`ham` or `spam`) and the second column is the text of the SMS message. (Note: Please see this dataset's applicable license for terms and conditions. Intel Corporation does not own the rights to this data set and does not confer any rights to it.)\n",
     "\n",
     "When using your own dataset, update the path to your dataset directory, as well the other variables with properties about the dataset like the csv file name, class names, delimiter, header, and the map function (if string labels need to be translated into numerical values)."
    ]
@@ -136,7 +136,7 @@
    },
    "outputs": [],
    "source": [
-    "zip_file_url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip\"\n",
+    "zip_file_url = \"https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip\"\n",
     "sms_data_directory = os.path.join(dataset_dir, \"sms_spam_collection\")\n",
     "csv_file_name = \"SMSSpamCollection\"\n",
     "\n",
diff --git a/tests/tensorflow_tests/test_text_classification.py b/tests/tensorflow_tests/test_text_classification.py
@@ -205,7 +205,7 @@ def label_map_func(x):
 
     try:
         # Get the dataset
-        zip_file_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
+        zip_file_url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
         sms_data_directory = os.path.join(dataset_dir, "sms_spam_collection")
         csv_file_name = "SMSSpamCollection"