Merge pull request #3662 from bensteinberg/fix-sampling

Fix sampling
harvard-lil · Nov 21, 2024 · 2fa1c8e
2 parents fc48ba8 + 1c1fd71
commit 2fa1c8e
Showing 1 changed file with 24 additions and 8 deletions.
diff --git a/perma_web/tasks/dev.py b/perma_web/tasks/dev.py
@@ -1464,9 +1464,14 @@ def get_etag(bucket, path):
                 )
 
     # write to output file
-    filename = f"/tmp/sample-{n}-{datetime.isoformat(datetime.now())}.py"
+    timestamp = datetime.isoformat(datetime.now()).replace(":", "")
+    filename = f"/tmp/sample-{n}-{timestamp}.py"
     with open(filename, "w") as f:
-        f.write("import hashlib\nimport math\nfrom pathlib import Path\n\n")
+        f.write("import hashlib\n")
+        f.write("import math\n")
+        f.write("import sys\n")
+        f.write("from pathlib import Path\n")
+        f.write("from statistics import NormalDist\n\n")
         f.write(f"objects = {objects}\n")
         f.write(inspect.getsource(calculate_s3_etag))
         f.write(inspect.getsource(check_mirror))
@@ -1489,9 +1494,13 @@ def check_mirror():
     directories = sys.argv[2:]
 
     n = len(objects)  # noqa
+    if n * p < 10 or n - (n * p) < 10:
+        print(f"Sample size of {n} does not satisfy the success/failure condition for p of {p}.")  # noqa
+        return
+
     successes = 0
     failures = 0
-    blocksize = 2 ** 20
+    blocksize = 2 ** 20 * 8
 
     for o in objects:  # noqa
         success = 0
@@ -1502,32 +1511,39 @@ def check_mirror():
                     full_path = Path(d) / "generated" / o[archive]["path"]
                     if full_path.exists():
                         with open(full_path, "rb") as f:
-                            etag = calculate_s3_etag(f, blocksize)
+                            multipart = "-" in o[archive]["etag"]
+                            etag = calculate_s3_etag(f, blocksize, multipart)
                         if etag != o[archive]["etag"]:
                             failure += 1
                             print(
                                 f'etag mismatch for {o[archive]["path"]}'
                             )
                         else:
                             success += 1
-        if not success:
-            print(f'no file found for {o[archive]["path"]}')
         if failure or not success:
             failures += 1
+        elif not success:
+            failures += 1
+            print(f'no file found for {o[archive]["path"]}')
         else:
             successes += 1
 
-    assert successes + failures == n
-
+    # observed proportion
     p_hat = failures / n
 
+    # standard deviation
     sd = math.sqrt((p * (1 - p)) / n)  # noqa
 
+    # z-score
     z = (p_hat - p) / sd
 
+    # area under the standard Normal curve
+    probability = NormalDist().cdf(z)  # noqa
+
     print(f"From a sample of {n} links:")
     print(f"{successes} successes, {failures} failures")
     print(f"Expected proportion is {p}")
     print(f"Standard deviation is {sd}")
     print(f"Observed proportion is {p_hat}")
     print(f"z-score is {z}")
+    print(f"Chance of this result is {probability*100:.3f}%")