-
Notifications
You must be signed in to change notification settings - Fork 845
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Updated key rotation to be more random #2504
Changes from 29 commits
b240008
58cecd4
570c09a
e6bf1f1
dc00428
487f7f9
0c040f7
30f777c
6d4b07e
05cb234
c53fa88
85711df
0a91df7
57bd526
4347415
48ea444
23da65b
453bf8b
024a389
060d462
833221b
18d1c46
437733f
0651965
6a6214c
19949b8
3388c76
be0dd4f
537e469
c438717
3dd65ff
cc2d62e
9619b6e
aa59483
8364aee
2bf0cd9
4e7475e
69e0b42
d7778ae
fc69726
f9ec47c
087ff33
fd2ef2f
dcc875e
bb56462
e200b28
24a4863
98ba73e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,7 @@ | |
from augur.tasks.util.redis_list import RedisList | ||
from augur.application.db.session import DatabaseSession | ||
from augur.application.config import AugurConfig | ||
from sqlalchemy import func | ||
|
||
|
||
class NoValidKeysError(Exception): | ||
|
@@ -39,7 +40,7 @@ def __init__(self, session: DatabaseSession): | |
|
||
self.keys = self.get_api_keys() | ||
|
||
# self.logger.debug(f"Retrieved {len(self.keys)} github api keys for use") | ||
self.logger.info(f"Retrieved {len(self.keys)} github api keys for use") | ||
|
||
def get_random_key(self): | ||
"""Retrieves a random key from the list of keys | ||
|
@@ -71,9 +72,11 @@ def get_api_keys_from_database(self) -> List[str]: | |
from augur.application.db.models import WorkerOauth | ||
|
||
select = WorkerOauth.access_token | ||
# randomizing the order at db time | ||
#select.order_by(func.random()) | ||
where = [WorkerOauth.access_token != self.config_key, WorkerOauth.platform == 'github'] | ||
|
||
return [key_tuple[0] for key_tuple in self.session.query(select).filter(*where).all()] | ||
return [key_tuple[0] for key_tuple in self.session.query(select).filter(*where).order_by(func.random()).all()] | ||
|
||
|
||
def get_api_keys(self) -> List[str]: | ||
|
@@ -130,6 +133,18 @@ def get_api_keys(self) -> List[str]: | |
if not valid_keys: | ||
raise NoValidKeysError("No valid github api keys found in the config or worker oauth table") | ||
|
||
|
||
# shuffling the keys so not all processes get the same keys in the same order | ||
valid_now = valid_keys | ||
try: | ||
self.logger.debug(f'valid keys before shuffle: {valid_keys}') | ||
valid_keys = random.sample(valid_keys, len(valid_keys)) | ||
self.logger.debug(f'valid keys AFTER shuffle: {valid_keys}') | ||
except Exception as e: | ||
self.logger.debug(f'{e}') | ||
valid_keys = valid_now | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This only gets executed the first time the GithubApiKeyHandler is created; after that, the keys will be stored in Redis, so this function will return early. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That's ok. What was happening is we'd always get them in table order, so by randomizing how Redis is first populated we don't start with the same list, which makes the first hour of any restart go slower because it runs out of keys (at least going from the previous, non-randomized configuration to the changes I made). |
||
pass | ||
|
||
return valid_keys | ||
|
||
def is_bad_api_key(self, client: httpx.Client, oauth_key: str) -> bool: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,7 @@ | |
from augur.tasks.util.random_key_auth import RandomKeyAuth | ||
from augur.tasks.github.util.github_api_key_handler import GithubApiKeyHandler | ||
from augur.application.db.session import DatabaseSession | ||
import random | ||
|
||
|
||
class GithubRandomKeyAuth(RandomKeyAuth): | ||
|
@@ -16,6 +17,7 @@ def __init__(self, session: DatabaseSession, logger): | |
|
||
# gets the github api keys from the database via the GithubApiKeyHandler | ||
github_api_keys = GithubApiKeyHandler(session).keys | ||
github_api_keys = random.sample(github_api_keys, len(github_api_keys)) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I believe the goal here is to randomize the order. If so, random.shuffle should be used, since random.sample is used to get a random subset of a list. This will randomly order the list, but it is less clear what the goal is. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In short, we get more durable randomness from random.sample than random.shuffle ... random.shuffle reorders the original list, but it does this in memory; it doesn't reorder what is stored in Redis. random.sample does a "better" job of getting us novel keys from a list every time. I tested both, and was reusing the same key about 3x as often with shuffle compared to sample. From here: https://blog.enterprisedna.co/python-shuffle-list/ It’s essential to note that the shuffle() function returns None and modifies the original list or array. Therefore, it’s unsuitable for cases where you must maintain the original list order. To return a new list containing elements from the original list without modifying it, you can use the sample() function from the random library: |
||
if not github_api_keys: | ||
print("Failed to find github api keys. This is usually because your key has expired") | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -168,8 +168,10 @@ def pop(self, index: int = None): | |
""" | ||
|
||
if index is None: | ||
|
||
redis.rpop(self.redis_list_key) | ||
# This will get a random index from the list and remove it, | ||
# decreasing the likelihood of everyone using the same key all the time | ||
#redis.rpop(self.redis_list_key) | ||
redis.spop(self.redis_list_key) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This won't do anything, because this isn't how we are getting the API keys from Redis. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. spop does a randomized retrieval, while rpop does it in order. The issue with using the order is that we will never know — and don't want to try to know — how big each job is before we do it. This balances out our key counts. I proved this by changing it and running it and seeing a HUGE gain in collection rate. |
||
else: | ||
# calls __delitem__ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think this is needed, as this is just going to randomize the order of the keys that are cached in Redis the first time the GithubApiKeyHandler is created. So this will just make the keys in Redis be stored in a different order.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's ok. What was happening is we'd always get them in table order, so by randomizing how redis is first populated we don't start with the same list, which makes the first hour of any restart go slower because it runs out of keys (at least going from previous, non-randomized configuration to the changes I made).