diff --git a/docs/user/bots.rst b/docs/user/bots.rst index aa44c6a85..914970670 100644 --- a/docs/user/bots.rst +++ b/docs/user/bots.rst @@ -1243,6 +1243,38 @@ HTML Table Parser * `"type"`: set the `classification.type` statically, optional * `"html_parser"`: The HTML parser to use, by default "html.parser", can also be e.g. "lxml", have a look at https://www.crummy.com/software/BeautifulSoup/bs4/doc/ +JSON Custom Parser +^^^^^^^^^^^^^^^^^^ + +**Configuration parameters** + +* `"json_data_format"`: Boolean, set to true if the list of data is within a key of the JSON object, optional. Default: false. +* `"json_data_key"`: Key of the JSON object where the data list is present. The string should be a flattened key (separator: `"."`), optional. To be used in conjunction with `"json_data_format"`. Default: `""`. + E.g. + + .. code-block:: json + + "json_data_format": true, + "json_data_key": "data.ipdata" + + With the above configuration, a list of dicts will be created from the list present in json["data"]["ipdata"]. Each dict will then create at least one event. +* `"splitlines"`: Boolean, split multiline data into a list, optional. Default: `"false"`. Either `"json_data_format"` or `"splitlines"` can be used. +* `"translate_fields"`: A dictionary mapping harmonized fields to flattened JSON keys (separator: `"."`). These flattened keys should be relative to `"json_data_key"`. + + .. code-block:: json + + "translate_fields": { + "source.url": "url", + "time.source": "lseen", + "extra.tags": "tags.str" + }, + + The above configuration will put the value of the "url" key into "source.url", the value of the "lseen" key into "time.source" and so on. + +* `"default_url_protocol"`: For URLs you can give a default protocol which will be prepended to the data. Default: `"http://"`. +* `"time_format"`: Optional. If `"timestamp"`, `"windows_nt"`, `"epoch_millis"`, `"from_format"`, `"from_format_midnight"`, `"utc_isoformat"` or `"fuzzy"` the time will be converted first. With the default `"null"` fuzzy time parsing will be used. 
class JSONCustomParserBot(ParserBot):
    """Parse custom JSON reports into events using a configurable field mapping.

    The report's ``raw`` payload is decoded and split into records in one of
    three ways:

    * ``json_data_format``: the payload is one JSON document and the records
      are the list found under ``json_data_key`` (a ``.``-separated path),
    * ``splitlines``: every non-empty line of the payload is one JSON record,
    * otherwise the whole payload is a single JSON record.

    Each record is flattened (nested dict keys joined with ``.``) and the
    entries of ``translate_fields`` (harmonized field -> flattened key) are
    copied into a new event.  If ``multiple_msg_field`` names a translated
    field holding a list, one event is emitted per list element.
    """

    def init(self):
        """Read the bot parameters and validate ``time_format``.

        Raises:
            InvalidArgument: if the part of ``time_format`` before ``|`` is
                not a key of ``DateTime.TIME_CONVERSIONS``.
        """
        self.time_format = getattr(self.parameters, "time_format", None)
        if self.time_format and self.time_format.split('|')[0] not in DateTime.TIME_CONVERSIONS:
            # Imported locally: the module's import block does not provide
            # this name, so raising it used to fail with a NameError.
            from intelmq.lib.exceptions import InvalidArgument
            raise InvalidArgument('time_format', got=self.time_format,
                                  expected=list(DateTime.TIME_CONVERSIONS.keys()),
                                  docs='https://intelmq.readthedocs.io/en/latest/user/Bots.html#json-custom-parser')

        self.json_data_format = getattr(self.parameters, 'json_data_format', False)
        self.json_data_key = getattr(self.parameters, 'json_data_key', '')
        self.multiple_msg_field = getattr(self.parameters, 'multiple_msg_field', None)
        self.translate_fields = getattr(self.parameters, 'translate_fields', {})
        self.split_lines = getattr(self.parameters, 'splitlines', False)
        self.default_url_protocol = getattr(self.parameters, 'default_url_protocol', 'http://')
        # "type" is documented as optional with default "c2server"; without a
        # fallback a missing parameter raised AttributeError here.
        self.classification_type = getattr(self.parameters, 'type', 'c2server')

    def flatten_json(self, json_object):
        """Return a flat dict for *json_object*.

        Nested dict keys are joined with ``.`` (``{"a": {"b": 1}}`` becomes
        ``{"a.b": 1}``).  Non-dict values, including lists, are stored
        unchanged; empty dicts contribute no entry.  A non-dict
        *json_object* is returned under the empty key ``""``.
        """
        flat = {}

        def walk(node, path):
            if isinstance(node, dict):
                for key, value in node.items():
                    walk(value, path + (key,))
            else:
                flat['.'.join(path)] = node

        walk(json_object, ())
        return flat

    def _extract_records(self, document):
        """Return the records addressed by ``json_data_key`` inside *document*.

        A literal top-level key wins (the behaviour before dotted paths were
        supported); otherwise the key is followed as a ``.``-separated path,
        as the bot documentation describes.  An empty key selects the
        document itself.

        Raises:
            KeyError: if a component of the path is missing.
        """
        if not self.json_data_key:
            records = document
        elif isinstance(document, dict) and self.json_data_key in document:
            records = document[self.json_data_key]
        else:
            records = document
            for part in self.json_data_key.split('.'):
                records = records[part]
        # Iterating a dict would yield its keys; treat it as a single record.
        return [records] if isinstance(records, dict) else records

    def _translate(self, flat_record):
        """Map *flat_record* to harmonized fields according to ``translate_fields``."""
        translated = {}
        for field, source_key in self.translate_fields.items():
            value = flat_record.get(source_key)
            if value is None:
                # Key absent from this record: skip it instead of handing
                # None to the time conversion / event.
                continue

            if field in ("time.source", "time.destination"):
                try:
                    value = int(value)
                except (TypeError, ValueError):
                    # Not a numeric timestamp; let DateTime.convert handle it.
                    pass
                value = DateTime.convert(value, format=self.time_format)
            elif field.endswith('.url'):
                if not value:
                    continue
                if '://' not in value:
                    value = self.default_url_protocol + value

            translated[field] = value
        return translated

    def _expand(self, fields):
        """Return one field dict per element of the ``multiple_msg_field`` list.

        If the field is not configured, not present or not a list, *fields*
        is returned as the only element.
        """
        values = fields.get(self.multiple_msg_field)
        if not isinstance(values, list):
            return [fields]
        if not values:
            # An empty list used to yield no event at all, silently dropping
            # the record; emit it once without the empty field instead.
            remaining = fields.copy()
            del remaining[self.multiple_msg_field]
            return [remaining]

        expanded = []
        for value in values:
            variant = fields.copy()
            variant[self.multiple_msg_field] = value
            expanded.append(variant)
        return expanded

    def process(self):
        """Turn the received report into events and acknowledge it."""
        report = self.receive_message()
        raw_report = base64_decode(report["raw"])

        if self.json_data_format:
            lines = self._extract_records(Message.unserialize(raw_report))
        elif self.split_lines:
            lines = raw_report.splitlines()
        else:
            lines = [raw_report]

        for line in lines:
            if not line:
                continue

            # With json_data_format the records are already parsed objects.
            record = line if self.json_data_format else Message.unserialize(line)
            fields = self._translate(self.flatten_json(record))
            raw = Message.serialize(line) if self.json_data_format else line

            for event_fields in self._expand(fields):
                event = self.new_event(report)
                event.update(event_fields)

                if self.classification_type and "classification.type" not in event:
                    event.add('classification.type', self.classification_type)
                event['raw'] = raw

                self.send_message(event)

        self.acknowledge_message()
false, + "vpn_service": "" + }, + "raw_data": { + "scan": [ + { + "port": 445, + "protocol": "TCP" + } + ], + "web": {}, + "ja3": [] + } + } + ], + "message": "ok" +} diff --git a/intelmq/tests/bots/parsers/json_custom/multiple_msg.json b/intelmq/tests/bots/parsers/json_custom/multiple_msg.json new file mode 100644 index 000000000..36b60f393 --- /dev/null +++ b/intelmq/tests/bots/parsers/json_custom/multiple_msg.json @@ -0,0 +1 @@ +{"domain": "kreditohneschufa48.de", "fseen": 1576368000, "lseen": 1607731200, "collect": 1607817600, "tags": {"str": ["spam"], "codes": [2]}, "resolved": {"ip": {"a": ["23.60.91.225", "23.200.237.225"], "alias": [], "cname": []}, "whois": {"created": "1970-01-01 00:00:00", "updated": "1970-01-01 00:00:00", "expires": "1970-01-01 00:00:00", "age": 0, "registrar": "unknown", "registrant": "unknown", "havedata": "false"}}, "score": {"total": 3, "src": 60.2, "tags": 0.75, "frequency": 0.07}, "fp": {"alarm": "false", "descr": ""}, "threat": [], "id": "d267c60f-5709-3698-9523-f727f42ab5c7", "title": "RST Threat feed. IOC: kreditohneschufa48.de", "description": "IOC with tags: spam"} diff --git a/intelmq/tests/bots/parsers/json_custom/sample.json b/intelmq/tests/bots/parsers/json_custom/sample.json new file mode 100644 index 000000000..dcf7a7fa2 --- /dev/null +++ b/intelmq/tests/bots/parsers/json_custom/sample.json @@ -0,0 +1 @@ +{"url": "114.234.166.255:39436/mozi.a", "fseen": 1598918400, "lseen": 1601942400, "collect": 1602028800, "tags": {"str": ["malware"], "codes": [10]}, "score": {"total": 10, "src": 73.06, "tags": 0.89, "frequency": 0.58}, "resolved": {"status": 503}, "fp": {"alarm": "true", "descr": "Resource unavailable"}, "threat": [], "id": "987f5038-298f-37eb-a1d5-a17105f6b4b5", "title": "RST Threat feed. 
class TestJSONCustomParserBot(test.BotTestCase, unittest.TestCase):
    """
    Runs JSONCustomParserBot on a report whose records sit under the "data" key.
    """

    @classmethod
    def set_bot(cls):
        cls.bot_reference = JSONCustomParserBot

    def test_sample(self):
        """ The first record must yield the expected Event. """
        field_mapping = {"source.ip": "ip",
                         "time.source": "last_seen",
                         "extra.tags": "tags"}
        self.input_message = REPORT
        self.sysconfig = {
            "json_data_format": True,
            "json_data_key": "data",
            "type": "malware",
            "time_format": "from_format_midnight|%Y-%m-%d",
            "translate_fields": field_mapping,
        }
        self.run_bot()
        self.assertMessageEqual(0, EVENT)
class TestJSONCustomParserBot(test.BotTestCase, unittest.TestCase):
    """
    Runs JSONCustomParserBot with "multiple_msg_field" set, expecting one Event per list value.
    """

    @classmethod
    def set_bot(cls):
        cls.bot_reference = JSONCustomParserBot

    def test_multiple_msg(self):
        """ Both values of the list field must yield their own Event. """
        field_mapping = {"source.fqdn": "domain",
                         "time.source": "lseen",
                         "extra.tags": "tags.str",
                         "extra.threat_info": "threat",
                         "source.ip": "resolved.ip.a"}
        self.input_message = MULTILINE_REPORT
        self.sysconfig = {
            "splitlines": True,
            "type": "malware",
            "time_format": "epoch_millis",
            "multiple_msg_field": "source.ip",
            "translate_fields": field_mapping,
        }
        self.run_bot()
        for position, expected in enumerate((MULTIPLE_EVENT1, MULTIPLE_EVENT2)):
            self.assertMessageEqual(position, expected)
class TestJSONCustomParserBot(test.BotTestCase, unittest.TestCase):
    """
    Runs JSONCustomParserBot on a single-line JSON report describing a URL.
    """

    @classmethod
    def set_bot(cls):
        cls.bot_reference = JSONCustomParserBot

    def test_sample(self):
        """ The single record must yield the expected Event. """
        field_mapping = {"source.url": "url",
                         "time.source": "lseen",
                         "extra.tags": "tags.str",
                         "extra.threat_info": "threat"}
        self.input_message = REPORT
        self.sysconfig = {
            "splitlines": True,
            "type": "malware",
            "time_format": "epoch_millis",
            "translate_fields": field_mapping,
        }
        self.run_bot()
        self.assertMessageEqual(0, EVENT)