Skip to content

Commit 54b1c22

Browse files
authored
Update SMS Spam URL (#346)
Signed-off-by: Melanie Buehler <[email protected]>
1 parent fccc229 commit 54b1c22

File tree

9 files changed

+17
-18
lines changed

9 files changed

+17
-18
lines changed

DATASETS.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ This is a comprehensive list of public datasets used by this repository.
88
| [AG News (TFDS)](https://www.tensorflow.org/datasets/catalog/ag_news_subset) | TensorFlow | Text Classification |
99
| [Food101 (Torchvision)](https://pytorch.org/vision/stable/generated/torchvision.datasets.Food101.html#torchvision.datasets.Food101) | PyTorch | Image Classification |
1010
| [Food101 (TFDS)](https://www.tensorflow.org/datasets/catalog/food101) | TensorFlow | Image Classification |
11-
| [SMS Spam Collection](https://archive-beta.ics.uci.edu/dataset/228/sms+spam+collection) | PyTorch & TensorFlow | Text Classification |
11+
| [SMS Spam Collection](https://archive.ics.uci.edu/dataset/228/sms+spam+collection) | PyTorch & TensorFlow | Text Classification |
1212
| [TF Flowers (TFDS)](https://www.tensorflow.org/datasets/catalog/tf_flowers) | PyTorch & TensorFlow | Image Classification |
1313
| [Cats vs. Dogs (TFDS)](https://www.tensorflow.org/datasets/catalog/cats_vs_dogs) | TensorFlow | Image Classification |
1414
| [Country211 (Torchvision)](https://pytorch.org/vision/stable/generated/torchvision.datasets.Country211.html#torchvision.datasets.Country211) | PyTorch | Image Classification |

downloader/tests/test_dataset_download.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ class TestDatasetDownload:
2828
Tests the dataset downloader with a temp download directory that is initialized and cleaned up
2929
"""
3030
URLS = {'sms_spam_collection':
31-
'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip',
31+
'https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip',
3232
'flowers':
3333
'https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz',
3434
'imagenet_labels':
@@ -80,10 +80,9 @@ def test_catalog_download(self, dataset_name, catalog, split, kwargs, size):
8080
# Check that the directory is not empty
8181
assert os.listdir(self._dataset_dir) is not None
8282

83-
# Removing the SMS Spam Collection test because it's failing (404 error)
84-
# ['sms_spam_collection', URLS['sms_spam_collection'], 2],
8583
@pytest.mark.parametrize('dataset_name,url,num_contents',
86-
[['flowers', URLS['flowers'], 1],
84+
[['sms_spam_collection', URLS['sms_spam_collection'], 2],
85+
['flowers', URLS['flowers'], 1],
8786
['imagenet_labels', URLS['imagenet_labels'], 1],
8887
['peacock', URLS['peacock'], 1],
8988
['pennfudan', URLS['pennfudan'], 1]])

examples/cli/text_classification.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ The `--dataset-dir` argument is the path to the directory where your dataset is
1313
argument to specify a list of the classes and the `--delimiter` to specify the character that
1414
separates the two columns. If no `--delimiter` is specified, the CLI will default to use a comma (`,`).
1515

16-
This example is downloading the [SMS Spam Collection](https://archive-beta.ics.uci.edu/dataset/228/sms+spam+collection)
16+
This example is downloading the [SMS Spam Collection](https://archive.ics.uci.edu/dataset/228/sms+spam+collection)
1717
dataset, which has a tab separated value file in the .zip file. This dataset has labeled SMS text
1818
messages that are either being classified as `ham` or `spam`. The first column in the data file has
1919
the label (`ham` or `spam`) and the second column is the text of the SMS message. The string class
@@ -26,8 +26,8 @@ mkdir -p ${DATASET_DIR}
2626
mkdir -p ${OUTPUT_DIR}
2727

2828
# Download and extract the dataset
29-
wget -P ${DATASET_DIR} https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
30-
unzip ${DATASET_DIR}/smsspamcollection.zip
29+
wget -P ${DATASET_DIR} https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip
30+
unzip ${DATASET_DIR}/sms+spam+collection.zip
3131

3232
# Make a copy of the .csv file with 'numerical' in the file name
3333
DATASET_FILE=SMSSpamCollection_numerical.csv

notebooks/text_classification/pytorch_text_classification/PyTorch_Text_Classifier_fine_tuning.ipynb

+2-2
Original file line numberDiff line numberDiff line change
@@ -296,7 +296,7 @@
296296
"\n",
297297
"Instead of using a dataset from the Hugging Face dataset catalog, a custom dataset from your local system or a download can be used.\n",
298298
"\n",
299-
"In this example, we download the [SMS Spam Collection dataset](https://archive-beta.ics.uci.edu/ml/datasets/sms+spam+collection). (Note: Please see this dataset's applicable license for terms and conditions. Intel Corporation does not own the rights to this data set and does not confer any rights to it.) The zip file has a single tab-separated value file with two columns. The first column is the label (`ham` or `spam`) and the second column is the text of the SMS message:\n",
299+
"In this example, we download the [SMS Spam Collection dataset](https://archive.ics.uci.edu/ml/datasets/sms+spam+collection). (Note: Please see this dataset's applicable license for terms and conditions. Intel Corporation does not own the rights to this data set and does not confer any rights to it.) The zip file has a single tab-separated value file with two columns. The first column is the label (`ham` or `spam`) and the second column is the text of the SMS message:\n",
300300
"```\n",
301301
"<ham or spam>\t<text>\n",
302302
"<ham or spam>\t<text>\n",
@@ -379,7 +379,7 @@
379379
"\n",
380380
"# Modify the variables below to use a different dataset or a csv file on your local system.\n",
381381
"# The csv_path variable should be pointing to a csv file with 2 columns (the label and the text)\n",
382-
"dataset_url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip\"\n",
382+
"dataset_url = \"https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip\"\n",
383383
"dataset_dir = os.path.join(dataset_dir, \"smsspamcollection\")\n",
384384
"csv_name = \"SMSSpamCollection\"\n",
385385
"delimiter = \"\\t\"\n",

notebooks/text_classification/pytorch_text_classification/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
This notebook demonstrates fine tuning [pretrained models from Hugging Face](https://huggingface.co/models)
44
using text classification datasets from the [Hugging Face Datasets catalog](https://huggingface.co/datasets) or
55
a custom dataset. The [IMDb Large Movie Review dataset](https://ai.stanford.edu/~amaas/data/sentiment/) is used
6-
from the Hugging Face Datasets catalog, and the [SMS Spam Collection dataset](https://archive-beta.ics.uci.edu/ml/datasets/sms+spam+collection)
6+
from the Hugging Face Datasets catalog, and the [SMS Spam Collection dataset](https://archive.ics.uci.edu/ml/datasets/sms+spam+collection)
77
is used as an example of a custom dataset being loaded from a csv file.
88

99
The notebook uses

notebooks/text_classification/tfhub_text_classification/BERT_Binary_Text_Classification.ipynb

+2-2
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@
268268
"\n",
269269
"Instead of using a dataset from TensorFlow datasets, another dataset from your local system or a download can be used. \n",
270270
"\n",
271-
"In this example, we download the [SMS Spam Collection dataset](https://archive-beta.ics.uci.edu/ml/datasets/sms+spam+collection). (Note: Please see this dataset's applicable license for terms and conditions. Intel Corporation does not own the rights to this data set and does not confer any rights to it.) The zip file has a single tab-separated value file with two columns. The first column is the label (`ham` or `spam`) and the second column is the text of the SMS message:\n",
271+
"In this example, we download the [SMS Spam Collection dataset](https://archive.ics.uci.edu/ml/datasets/sms+spam+collection). (Note: Please see this dataset's applicable license for terms and conditions. Intel Corporation does not own the rights to this data set and does not confer any rights to it.) The zip file has a single tab-separated value file with two columns. The first column is the label (`ham` or `spam`) and the second column is the text of the SMS message:\n",
272272
"```\n",
273273
"<ham or spam>\t<text>\n",
274274
"<ham or spam>\t<text>\n",
@@ -357,7 +357,7 @@
357357
"\n",
358358
"# Modify the variables below to use a different dataset or a csv file on your local system.\n",
359359
"# The csv_path variable should be pointing to a csv file with 2 columns (the label and the text)\n",
360-
"dataset_url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip\"\n",
360+
"dataset_url = \"https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip\"\n",
361361
"dataset_directory = os.path.join(dataset_directory, \"smsspamcollection\")\n",
362362
"csv_name = \"SMSSpamCollection\"\n",
363363
"delimiter = \"\\t\"\n",

notebooks/text_classification/tlt_api_pyt_text_classification/TLT_PYT_Text_Classification.ipynb

+2-2
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@
120120
"\n",
121121
"If the .csv has more columns, the `select_cols` or `exclude_cols` parameters can be used to filter out which columns are parsed.\n",
122122
"\n",
123-
"This example is downloading the [SMS Spam Collection](https://archive-beta.ics.uci.edu/ml/datasets/sms+spam+collection) dataset, which has a tab separated value file in the .zip file. This dataset has labeled SMS text messages that are either being classified as `ham` or `spam`. The first column in the data file has the label (`ham` or `spam`) and the second column is the text of the SMS message. (Note: Please see this dataset's applicable license for terms and conditions. Intel Corporation does not own the rights to this data set and does not confer any rights to it.)\n",
123+
"This example is downloading the [SMS Spam Collection](https://archive.ics.uci.edu/ml/datasets/sms+spam+collection) dataset, which has a tab separated value file in the .zip file. This dataset has labeled SMS text messages that are either being classified as `ham` or `spam`. The first column in the data file has the label (`ham` or `spam`) and the second column is the text of the SMS message. (Note: Please see this dataset's applicable license for terms and conditions. Intel Corporation does not own the rights to this data set and does not confer any rights to it.)\n",
124124
"\n",
125125
"When using your own dataset, update the path to your dataset directory, as well as the other variables with properties about the dataset like the csv file name, class names, delimiter, header, and the map function (if string labels need to be translated into numerical values)."
126126
]
@@ -134,7 +134,7 @@
134134
"source": [
135135
"# Modify the variables below to use a different dataset or a csv file on your local system.\n",
136136
"# The csv_path variable should be pointing to a csv file with 2 columns (the label and the text)\n",
137-
"dataset_url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip\"\n",
137+
"dataset_url = \"https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip\"\n",
138138
"dataset_dir = os.path.join(dataset_dir, \"sms_spam_collection\")\n",
139139
"csv_name = \"SMSSpamCollection\"\n",
140140
"delimiter = \"\\t\"\n",

notebooks/text_classification/tlt_api_tf_text_classification/TLT_TF_Text_Classification.ipynb

+2-2
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@
120120
"\n",
121121
"If the .csv has more columns, the `select_cols` or `exclude_cols` parameters can be used to filter out which columns are parsed.\n",
122122
"\n",
123-
"This example is downloading the [SMS Spam Collection](https://archive-beta.ics.uci.edu/ml/datasets/sms+spam+collection) dataset, which has a tab separated value file in the .zip file. This dataset has labeled SMS text messages that are either being classified as `ham` or `spam`. The first column in the data file has the label (`ham` or `spam`) and the second column is the text of the SMS message. (Note: Please see this dataset's applicable license for terms and conditions. Intel Corporation does not own the rights to this data set and does not confer any rights to it.)\n",
123+
"This example is downloading the [SMS Spam Collection](https://archive.ics.uci.edu/ml/datasets/sms+spam+collection) dataset, which has a tab separated value file in the .zip file. This dataset has labeled SMS text messages that are either being classified as `ham` or `spam`. The first column in the data file has the label (`ham` or `spam`) and the second column is the text of the SMS message. (Note: Please see this dataset's applicable license for terms and conditions. Intel Corporation does not own the rights to this data set and does not confer any rights to it.)\n",
124124
"\n",
125125
"When using your own dataset, update the path to your dataset directory, as well as the other variables with properties about the dataset like the csv file name, class names, delimiter, header, and the map function (if string labels need to be translated into numerical values)."
126126
]
@@ -136,7 +136,7 @@
136136
},
137137
"outputs": [],
138138
"source": [
139-
"zip_file_url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip\"\n",
139+
"zip_file_url = \"https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip\"\n",
140140
"sms_data_directory = os.path.join(dataset_dir, \"sms_spam_collection\")\n",
141141
"csv_file_name = \"SMSSpamCollection\"\n",
142142
"\n",

tests/tensorflow_tests/test_text_classification.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,7 @@ def label_map_func(x):
205205

206206
try:
207207
# Get the dataset
208-
zip_file_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
208+
zip_file_url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
209209
sms_data_directory = os.path.join(dataset_dir, "sms_spam_collection")
210210
csv_file_name = "SMSSpamCollection"
211211

0 commit comments

Comments
 (0)