Skip to content

Commit

Permalink
chg: [module] add CEDetector
Browse files Browse the repository at this point in the history
  • Loading branch information
Terrtia committed Oct 11, 2024
1 parent 2ead8c2 commit 3fb281f
Show file tree
Hide file tree
Showing 4 changed files with 140 additions and 1 deletion.
1 change: 1 addition & 0 deletions bin/lib/objects/Titles.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ def create_title(content):
title.create(content)
return title


class Titles(AbstractDaterangeObjects):
"""
Titles Objects
Expand Down
10 changes: 10 additions & 0 deletions bin/lib/objects/abstract_daterange_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,9 +193,19 @@ def __init__(self, obj_type, obj_class):
self.type = obj_type
self.obj_class = obj_class

################################################
################################################

def get_ids(self):
    """Return the members of the ``<type>:all`` set for this object type."""
    all_ids_key = f'{self.type}:all'
    return r_object.smembers(all_ids_key)

def get_iterator(self):
    """Lazily yield one ``self.obj_class`` instance per id from ``get_ids()``."""
    yield from (self.obj_class(identifier) for identifier in self.get_ids())

################################################
################################################

# def get_ids_iterator(self):
# return r_object.sscan_iter(r_object, f'{self.type}:all')

Expand Down
122 changes: 122 additions & 0 deletions bin/modules/CEDetector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The CEDetector Module
============================
This module tokenizes the content of each received object and checks the tokens
against child-exploitation word lists. When a match is found, the
child-exploitation tag is sent to the Tags queue.
Requirements
------------
*Need running Redis instances. (Redis)
"""
import os
import sys

from textblob import TextBlob
from nltk.tokenize import RegexpTokenizer

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from modules.abstract_module import AbstractModule
from lib.ConfigLoader import ConfigLoader

class CEDetector(AbstractModule):
    """Detect child-exploitation vocabulary in object content and tag matches.

    The content of the current object is lower-cased, tokenized and compared
    against three word lists loaded from ``$AIL_HOME/files``:

    * ``csam_words``  - a single hit is enough to tag the object;
    * ``child_words`` and ``porn_words`` - the object is tagged only when at
      least one word of *each* list is present.
    """

    def __init__(self, queue=True):
        super(CEDetector, self).__init__(queue=queue)

        config_loader = ConfigLoader()
        self.r_cache = config_loader.get_redis_conn("Redis_Cache")

        self.csam_words = self.load_world_file('csam_words')
        self.child_worlds = self.load_world_file('child_words')
        self.porn_worlds = self.load_world_file('porn_words')

        self.ce_tag = 'dark-web:topic="pornography-child-exploitation"'
        # Raw string: the pattern is full of regex escapes that are not valid
        # Python string escapes. With gaps=True the pattern describes the
        # separators (punctuation and whitespace), not the tokens.
        self.tokenizer = RegexpTokenizer(
            r'[\&\~\:\;\,\.\(\)\{\}\|\[\]\\//\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+',
            gaps=True, discard_empty=True)

    def load_world_file(self, path):
        """Load a word list from ``$AIL_HOME/files/<path>``.

        Blank lines and lines starting with ``#`` are ignored. Only the first
        whitespace-separated token of each remaining line is kept. Words are
        lower-cased because ``compute`` lower-cases the content before
        matching, so a mixed-case entry could otherwise never match.

        :param path: file name relative to ``$AIL_HOME/files``
        :return: set of words; empty if the file does not exist
        """
        words = set()
        try:
            with open(os.path.join(os.environ['AIL_HOME'], f'files/{path}')) as f:
                lines = f.read().splitlines()
        except FileNotFoundError:
            # A missing list is treated as an empty list.
            return words
        for line in lines:
            if not line or line.startswith('#'):
                continue
            tokens = line.split()
            if tokens:
                words.add(tokens[0].lower())
        return words

    def compute(self, message):  # TODO LIMIT TO DARKWEB ???
        """Check the current object's content and queue the tag on a match.

        :param message: queue message (not used by the detection itself)
        :return: True if the object was tagged, False otherwise
        """
        content = self.obj.get_content().lower()
        words = set(TextBlob(content, tokenizer=self.tokenizer).tokens)

        # Set-disjointness checks replace a per-word loop over the tokens.
        is_csam = not words.isdisjoint(self.csam_words)
        is_child_word = not words.isdisjoint(self.child_worlds)
        is_porn_world = not words.isdisjoint(self.porn_worlds)

        to_tag = is_csam or (is_child_word and is_porn_world)
        if to_tag:
            self.add_message_to_queue(message=self.ce_tag, queue='Tags')

        return to_tag

def test_detection():
    """Print the tagged domains whose titles the detector does not flag.

    Iterates over the domains returned for the child-exploitation tag, runs
    ``compute`` on each correlated title and collects the domains for which no
    title triggers a detection.

    Relies on the module-level ``module`` instance created under the
    ``__main__`` guard, so it can only be called from there.
    """
    from lib import Tag
    from lib.objects.Domains import Domain
    from lib.objects.Titles import Title

    not_detected = set()
    tag = 'dark-web:topic="pornography-child-exploitation"'
    tag_key = f'domain::{tag}'
    for domain in Tag.get_obj_by_tag(tag_key):
        dom = Domain(domain)
        is_detected = False
        for h in dom.get_correlation('title').get('title', []):
            # NOTE(review): the first character of each correlation entry is
            # dropped - presumably a subtype/separator prefix; confirm.
            # compute() calls self.obj.get_content(), so the Title object
            # itself must be assigned, not its content string.
            module.obj = Title(h[1:])
            if module.compute(''):
                is_detected = True
        if not is_detected:
            not_detected.add(domain)
    print(not_detected)


# Script entry point: build the detector and start its processing loop.
if __name__ == "__main__":
    # Kept as a module-level name because test_detection() reads `module`.
    module = CEDetector()
    module.run()
    # test_detection()
8 changes: 7 additions & 1 deletion configs/modules.cfg
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
######## IMPORTERS ########

[Crawler]
publish = Importers,Tags,Images
publish = Importers,Tags,Images,Titles

[ZMQModuleImporter]
publish = Importers
Expand Down Expand Up @@ -172,6 +172,12 @@ publish = Item
subscribe = Images
publish = Item,Tags

######## TITLES ########

[CEDetector]
subscribe = Titles
publish = Tags

######## CORE ########

[Tags]
Expand Down

0 comments on commit 3fb281f

Please sign in to comment.