Skip to content

Commit

Permalink
Add utils for PyFilesystem and config key matching (#351)
Browse files Browse the repository at this point in the history
This lays the foundation for [supporting multiple input dirs](#352).
  • Loading branch information
hqpho authored Nov 11, 2024
1 parent 5d14167 commit c3030d7
Show file tree
Hide file tree
Showing 6 changed files with 662 additions and 2 deletions.
2 changes: 1 addition & 1 deletion run_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ function py_test {
pip3 install -r requirements.txt -q

echo -e "#### Running stats tests"
python3 -m pytest tests/stats/ -s
python3 -m pytest tests/ -s

deactivate
}
Expand Down
4 changes: 3 additions & 1 deletion simple/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ certifi==2023.7.22
charset-normalizer==3.2.0
cloud-sql-python-connector==1.4.3
freezegun==1.2.2
fs==2.4.16
fs-gcsfs==1.5.1
google-cloud-storage==2.11.0
httpx==0.26.0
idna==3.4
Expand All @@ -23,5 +25,5 @@ s2sphere==0.2.5
six==1.16.0
tomli==2.0.1
tzdata==2023.3
urllib3==2.0.4
urllib3==1.26.20
zipp==3.16.2
155 changes: 155 additions & 0 deletions simple/tests/util/file_match_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
import tempfile
import unittest

from util.file_match import match
from util.filesystem import create_store


class TestFileMatch(unittest.TestCase):
    """Tests for `match`, which checks a store file against a config key pattern.

    Pattern forms covered by these tests:
      * no slashes        - matched against the file name only
      * `protocol://...`  - protocol and full path must both match
      * leading `/`       - full match relative to the input dir
      * leading `//`      - full match against the absolute system path
      * other slashes     - partial match on trailing path segments
    `*` spans a single path level; `**` spans any number of directory levels
    and raises ValueError when used in the file-name portion of a pattern.
    """

    def _checkers(self, file):
        """Return `(yes, no, err)` assertion helpers bound to `file`.

        yes(pattern): asserts that `match(file, pattern)` is truthy.
        no(pattern):  asserts that `match(file, pattern)` is falsy.
        err(pattern): asserts that `match(file, pattern)` raises ValueError.

        Each helper passes the pattern as the failure message so that a
        failing case can be identified among the many patterns in one test.
        """

        def yes(pattern: str) -> None:
            self.assertTrue(match(file, pattern), pattern)

        def no(pattern: str) -> None:
            self.assertFalse(match(file, pattern), pattern)

        def err(pattern: str) -> None:
            with self.assertRaises(ValueError, msg=pattern):
                match(file, pattern)

        return yes, no, err

    def test_match_nested_dir(self):
        """Patterns against a file two directories below the store root."""
        # The store is used as a context manager so it is closed even when an
        # assertion fails part-way through.
        with create_store("temp://") as store:
            file = store.as_dir().open_file("path/to/foo.csv")
            yes, no, err = self._checkers(file)

            # File path: temp://path/to/foo.csv
            # Absolute path: /<generated>/path/to/foo.csv

            # No slashes: match file name
            yes("foo.csv")
            no("bar.csv")
            no("oo.csv")
            no("oo*.csv")
            yes("foo*.csv")  # Wildcard can be no characters
            yes("f*.csv")
            yes("*.csv")
            no("*.mcf")

            # With protocol: match protocol and full path
            yes("temp://path/to/**/*")
            yes("temp://**/foo.csv")
            yes("temp://**/*")
            no("temp://foo.csv")
            no("gs://**/*")

            # Leading single slash: partial match not allowed, match relative
            # to input dir
            no("/foo.csv")
            no("/to/foo.csv")
            yes("/path/to/foo.csv")
            no("/*")
            yes("/**/*")
            no("/*/foo.csv")  # Single wildcard is a single level of nesting
            no("/*/*/*/foo.csv")  # Wrong depth with single wildcards
            yes("/*/*/foo.csv")  # Right depth with single wildcards
            no("/*/*/oo.csv")  # Right depth, wrong filename
            yes("/*/*/*oo.csv")  # Wildcard paths, wildcard in filename
            yes("/**/foo.csv")  # Double wildcard can be multiple dir levels
            no("/**/oo.csv")
            yes("/**/*/foo.csv")
            yes("/*/**/foo.csv")
            yes("/**/*/*/foo.csv")
            yes("/*/**/*/foo.csv")
            yes("/**/*/**/*/**/foo.csv")  # Deliberately excessive but valid
            no("/**/*/**/*/**/*/**/foo.csv")
            no("/*/**/*/*/foo.csv")
            no("/**/*/*/*/foo.csv")

            # Leading double slash: partial match not allowed, match absolute
            # path
            abs_root = store.as_dir().syspath()
            self.assertTrue(abs_root.startswith("/"))
            self.assertFalse(abs_root.startswith("//"))
            self.assertTrue(abs_root.endswith("/"))
            no("//path/to/foo.csv")
            # abs_root already starts and ends with "/", so one extra slash
            # yields the "//<abs path>/path/to/foo.csv" form.
            yes(f"/{abs_root}path/to/foo.csv")
            # Double leading slash means the match must start at the abs root
            no("//to/foo.csv")
            no("//*/foo.csv")  # Single wildcard is a single level of nesting
            no("//*/*/*/foo.csv")  # Wrong depth with single wildcards
            yes("//**/foo.csv")  # Double wildcard can be multiple dir levels

            no("temp://*.csv")  # Single wildcard is a single level of nesting
            no("gs://**.csv")  # Wrong protocol

            # Slashes without a leading slash or protocol: partial match
            yes("path/to/foo.csv")
            yes("to/foo.csv")  # Partial match allowed
            yes("*/foo.csv")  # Wrong depth, but partial match allowed
            yes("*/*/foo.csv")
            yes("*/to/foo.csv")
            yes("**/to/foo.csv")
            yes("**/foo.csv")

            # Double wildcards don't make sense in the name portion of a
            # pattern.
            err("temp://**.csv")  # Use "temp://**/*.csv" instead
            err("**.csv")  # Use "*.csv" instead
            err("//**/to/**.csv")

    def test_os_abs_path(self):
        """Patterns against a store rooted at a real OS temp directory."""
        with tempfile.TemporaryDirectory() as temp_dir:
            # temp_dir is already a str; these checks guard the assumptions
            # behind the "//<abs path>" pattern built below.
            self.assertTrue(temp_dir.startswith("/"))
            self.assertFalse(temp_dir.startswith("//"))
            with create_store(temp_dir) as store:
                file = store.as_dir().open_file("path/to/foo.csv")
                yes, no, _ = self._checkers(file)

                yes(f"/{temp_dir}/path/to/foo.csv")
                no("//path/to/foo.csv")
                yes("/path/to/foo.csv")
                no("/to/foo.csv")
                yes("to/foo.csv")

    def test_match_in_input_dir(self):
        """Patterns against a file located directly in the store root."""
        with create_store("temp://") as store:
            file = store.as_dir().open_file("foo.csv")
            yes, no, err = self._checkers(file)

            # File path: temp://foo.csv

            yes("foo.csv")
            yes("foo*.csv")  # Wildcard can be no characters
            yes("*.csv")
            no("*.mcf")

            yes("/foo.csv")
            yes("/*foo.csv")
            no("/*/foo.csv")
            no("*/foo.csv")

            yes("**/foo.csv")
            yes("/**/foo.csv")
            yes("/*")
            yes("/**/*")

            no("/to/foo.csv")  # Extra dir
            yes("temp://**/foo.csv")
            no("gs://**/foo.csv")
            yes("temp://foo.csv")
            yes("temp://*.csv")

            err("**.csv")
            err("/**.csv")
            err("temp://**.csv")
109 changes: 109 additions & 0 deletions simple/tests/util/filesystem_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import os
import tempfile
import unittest

from util.filesystem import create_store


class TestFilesystem(unittest.TestCase):
    """Tests for `create_store` and the dir and file objects a store exposes."""

    def test_create_store_dir_new(self):
        """A newly created `mem://` root store is a directory."""
        root = "mem://"
        with create_store(root, create_if_missing=True) as store:
            self.assertTrue(store.isdir())
            self.assertEqual(store.full_path(), root)
            self.assertEqual(store.as_dir().full_path(), root)

    def test_create_store_file_new(self):
        """`treat_as_file=True` creates a file store, with or without subdirs."""
        # The second path also requires its parent directories to be created.
        for path in ("mem://foo.txt", "mem://path/to/foo.txt"):
            with create_store(path, create_if_missing=True,
                              treat_as_file=True) as store:
                self.assertFalse(store.isdir())
                self.assertEqual(store.full_path(), path)
                self.assertEqual(store.as_file().full_path(), path)

    def test_missing_file(self):
        """Opening a missing file with `create_if_missing=False` raises."""
        with create_store("mem://") as store:
            root = store.as_dir()
            with self.assertRaises(FileNotFoundError):
                root.open_file("nonexistent.txt", create_if_missing=False)

    def test_create_store_defaults_to_dir(self):
        """Without `treat_as_file`, a new path is created as a directory."""
        path = "mem://bar"
        with create_store(path, create_if_missing=True) as store:
            self.assertTrue(store.isdir())
            self.assertEqual(store.full_path(), path)

    def test_create_store_file_existing(self):
        """An existing file is opened as a file store."""
        with tempfile.TemporaryDirectory() as temp_dir:
            # Create a file "foo.txt" in temp_dir
            file_path = os.path.join(temp_dir, "foo.txt")
            with open(file_path, "w") as out:
                out.write("hello")

            # First with treat_as_file=True, then with the parameter omitted.
            for extra_kwargs in ({"treat_as_file": True}, {}):
                with create_store(file_path, create_if_missing=False,
                                  **extra_kwargs) as store:
                    self.assertFalse(store.isdir())
                    self.assertEqual(store.full_path(), file_path)
                    self.assertEqual(store.as_file().full_path(), file_path)

    def test_create_store_dir_existing(self):
        """An existing directory is opened as a directory store."""
        with tempfile.TemporaryDirectory() as temp_dir:
            dir_path = str(temp_dir)
            with create_store(dir_path, create_if_missing=False) as store:
                self.assertEqual(store.full_path(), dir_path)
                self.assertTrue(store.isdir())
                self.assertEqual(store.as_dir().full_path(), dir_path)

    def test_file(self):
        """Read, write and size methods on a file object."""
        with create_store("mem://dir/foo.txt",
                          create_if_missing=True,
                          treat_as_file=True) as store:
            target = store.as_file()

            # Text round trip
            target.write("hello")
            self.assertEqual(target.read(), "hello")
            with target.read_string_io() as text_stream:
                self.assertEqual(text_stream.read(), "hello")
            self.assertEqual(target.size(), 5)

            # Bytes round trip
            target.write_bytes(b"bytes")
            self.assertEqual(target.read_bytes(), b"bytes")

    def test_dir(self):
        """`open_dir`, `open_file` and `all_files` on a dir object."""
        with create_store("mem://") as store:
            root = store.as_dir()

            nested = root.open_dir("dir1/dir2")
            self.assertEqual(nested.full_path(), "mem://dir1/dir2")

            deep_file = nested.open_file("dir3/foo.txt")
            self.assertEqual(deep_file.full_path(),
                             "mem://dir1/dir2/dir3/foo.txt")

            root.open_file("bar.txt")
            nested.open_file("baz.txt")

            listed_paths = [entry.full_path() for entry in root.all_files()]
            expected_paths = [
                "mem://bar.txt",
                "mem://dir1/dir2/baz.txt",
                "mem://dir1/dir2/dir3/foo.txt",
            ]
            self.assertListEqual(listed_paths, expected_paths)

    def test_copy_to(self):
        """`copy_to` writes the source file's contents to the destination."""
        with create_store("mem://") as store:
            source = store.as_dir().open_file("foo.txt")
            source.write("hello")
            destination = store.as_dir().open_file("bar.txt")
            source.copy_to(destination)
            self.assertEqual(destination.read(), "hello")
Loading

0 comments on commit c3030d7

Please sign in to comment.