Skip to content

Commit

Permalink
Merge pull request #3662 from bensteinberg/fix-sampling
Browse files (browse the repository at this point in the history)
Fix sampling
  • Loading branch information
bensteinberg authored Nov 21, 2024
2 parents fc48ba8 + 1c1fd71 commit 2fa1c8e
Showing 1 changed file with 24 additions and 8 deletions.
32 changes: 24 additions & 8 deletions perma_web/tasks/dev.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -1464,9 +1464,14 @@ def get_etag(bucket, path):
)

# write to output file
filename = f"/tmp/sample-{n}-{datetime.isoformat(datetime.now())}.py"
timestamp = datetime.isoformat(datetime.now()).replace(":", "")
filename = f"/tmp/sample-{n}-{timestamp}.py"
with open(filename, "w") as f:
f.write("import hashlib\nimport math\nfrom pathlib import Path\n\n")
f.write("import hashlib\n")
f.write("import math\n")
f.write("import sys\n")
f.write("from pathlib import Path\n")
f.write("from statistics import NormalDist\n\n")
f.write(f"objects = {objects}\n")
f.write(inspect.getsource(calculate_s3_etag))
f.write(inspect.getsource(check_mirror))
Expand All @@ -1489,9 +1494,13 @@ def check_mirror():
directories = sys.argv[2:]

n = len(objects) # noqa
if n * p < 10 or n - (n * p) < 10:
print(f"Sample size of {n} does not satisfy the success/failure condition for p of {p}.") # noqa
return

successes = 0
failures = 0
blocksize = 2 ** 20
blocksize = 2 ** 20 * 8

for o in objects: # noqa
success = 0
Expand All @@ -1502,32 +1511,39 @@ def check_mirror():
full_path = Path(d) / "generated" / o[archive]["path"]
if full_path.exists():
with open(full_path, "rb") as f:
etag = calculate_s3_etag(f, blocksize)
multipart = "-" in o[archive]["etag"]
etag = calculate_s3_etag(f, blocksize, multipart)
if etag != o[archive]["etag"]:
failure += 1
print(
f'etag mismatch for {o[archive]["path"]}'
)
else:
success += 1
if not success:
print(f'no file found for {o[archive]["path"]}')
if failure or not success:
failures += 1
elif not success:
failures += 1
print(f'no file found for {o[archive]["path"]}')
else:
successes += 1

assert successes + failures == n

# observed proportion
p_hat = failures / n

# standard deviation
sd = math.sqrt((p * (1 - p)) / n) # noqa

# z-score
z = (p_hat - p) / sd

# area under the standard Normal curve
probability = NormalDist().cdf(z) # noqa

print(f"From a sample of {n} links:")
print(f"{successes} successes, {failures} failures")
print(f"Expected proportion is {p}")
print(f"Standard deviation is {sd}")
print(f"Observed proportion is {p_hat}")
print(f"z-score is {z}")
print(f"Chance of this result is {probability*100:.3f}%")

0 comments on commit 2fa1c8e

Please sign in to comment.