Skip to content

Commit

Permalink
chg: [crawler] tag domain by vanity
Browse files Browse the repository at this point in the history
  • Loading branch information
Terrtia committed Oct 10, 2024
1 parent 72f4733 commit 2ead8c2
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 0 deletions.
9 changes: 9 additions & 0 deletions bin/crawlers/Crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from lib import ail_logger
from lib import crawlers
from lib.ConfigLoader import ConfigLoader
from lib.Tag import get_domain_vanity_tags
from lib.objects import CookiesNames
from lib.objects import Etags
from lib.objects.Domains import Domain
Expand All @@ -40,6 +41,9 @@ def __init__(self):

self.tracker_yara = Tracker_Yara(queue=False)

self.vanity_tags = get_domain_vanity_tags()
print('vanity tags:', self.vanity_tags)

config_loader = ConfigLoader()

self.default_har = config_loader.get_config_boolean('Crawler', 'default_har')
Expand Down Expand Up @@ -271,7 +275,12 @@ def compute(self, capture):
# Origin + History + tags
if self.root_item:
self.domain.set_last_origin(parent_id)
# Vanity
self.domain.update_vanity_cluster()
domain_vanity = self.domain.get_vanity()
if domain_vanity in self.vanity_tags:
for tag in self.vanity_tags[domain_vanity]:
self.domain.add_tag(tag)
# Tags
for tag in task.get_tags():
self.domain.add_tag(tag)
Expand Down
18 changes: 18 additions & 0 deletions bin/lib/Tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -1521,6 +1521,24 @@ def refresh_auto_push():

# --- TAG AUTO PUSH --- #

def get_domain_vanity_tags():
    """Load the vanity -> tags mapping from ``$AIL_HOME/files/vanity_tags``.

    The file is expected to hold a JSON object whose keys are tags and whose
    values are iterables of vanity strings. The mapping is inverted so that
    each vanity string points to the list of tags declared for it. Only tags
    accepted by ``is_taxonomie_tag`` or ``is_galaxy_tag`` are kept.

    Returns:
        dict: ``{vanity: [tag, ...]}``. Empty when the file is missing,
        empty, not valid JSON, or not a JSON object.

    Raises:
        KeyError: if the ``AIL_HOME`` environment variable is not set.
    """
    vanity = {}
    vanity_file = os.path.join(os.environ['AIL_HOME'], 'files/vanity_tags')

    # Keep the try body limited to the I/O + parsing that can actually raise
    # the exceptions handled here.
    try:
        # JSON is UTF-8 by specification: do not depend on the locale default.
        with open(vanity_file, encoding='utf-8') as f:
            ltags = json.load(f)
    except FileNotFoundError:
        # The file is optional: no file means no vanity tags.
        return vanity
    except json.decoder.JSONDecodeError:
        print('Error files/vanity_tags, Invalid JSON')
        return vanity

    if not ltags:
        return vanity
    # A well-formed JSON document with the wrong top-level type (list, number,
    # string...) would otherwise raise a TypeError below and abort the caller.
    if not isinstance(ltags, dict):
        print('Error files/vanity_tags, expected a JSON object')
        return vanity

    for tag, tag_vanities in ltags.items():
        if is_taxonomie_tag(tag) or is_galaxy_tag(tag):
            for s_vanity in tag_vanities:
                vanity.setdefault(s_vanity, []).append(tag)
    return vanity

###################################################################################
###################################################################################
###################################################################################
Expand Down
3 changes: 3 additions & 0 deletions bin/lib/objects/Domains.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,9 @@ def add_language(self, language):
r_crawler.sadd(f'language:domains:{self.domain_type}:{language}', self.id)
r_crawler.sadd(f'domain:language:{self.id}', language)

def get_vanity(self, len_vanity=4):
    """Return the vanity of this domain.

    Delegates to the module-level ``get_domain_vanity`` with this object's
    ``id``.

    :param len_vanity: forwarded unchanged as the ``len_vanity`` keyword
        (presumably the number of leading characters kept -- confirm against
        ``get_domain_vanity``).
    :return: whatever ``get_domain_vanity`` returns for this domain.
    """
    domain_vanity = get_domain_vanity(self.id, len_vanity=len_vanity)
    return domain_vanity

def update_vanity_cluster(self):
    """Update the vanity cluster for this domain, onion domains only.

    Calls the module-level ``update_vanity_cluster`` helper with this
    object's ``id`` when ``get_domain_type()`` is ``'onion'``; any other
    domain type is left untouched.
    """
    is_onion = self.get_domain_type() == 'onion'
    if not is_onion:
        return
    update_vanity_cluster(self.id)
Expand Down

0 comments on commit 2ead8c2

Please sign in to comment.