certtools · kbrajneesh · Dec 14, 2020 · ghost · Dec 14, 2020 · kbrajneesh
diff --git a/docs/user/bots.rst b/docs/user/bots.rst
@@ -1243,6 +1243,38 @@ HTML Table Parser
  * `"type"`: set the `classification.type` statically, optional
  * `"html_parser"`: The HTML parser to use, by default "html.parser", can also be e.g. "lxml", have a look at https://www.crummy.com/software/BeautifulSoup/bs4/doc/
 
+JSON Custom Parser
+^^^^^^^^^^^^^^^^^^
+
+**Configuration parameters**
+
+* `"json_data_format"`: Boolean, optional. Set it to true if the list of data is stored under a key of the JSON object. Default: false.
+* `"json_data_key"`: String, optional. The flattened key (separator: `"."`) of the JSON object under which the data list is present. To be used in conjunction with `"json_data_format"`. Default: `""`.
+   E.g.
+
+  .. code-block:: json
+
+     "json_data_format": true,
+     "json_data_key": "data.ipdata"
+
+  With the above configuration, a list of dicts will be created from the list present in json["data"]["ipdata"]. Each dict will then create at least one event.
+* `"splitlines"`: Boolean, split multiline data into a list, optional. Default: false. Either `"json_data_format"` or `"splitlines"` can be used.
+* `"translate_fields"`: A dictionary that maps harmonized fields to flattened JSON keys (separator: `"."`). These flattened keys should be relative to `"json_data_key"`.
+
+  .. code-block:: json
+
+     "translate_fields": {
+         "source.url": "url",
+         "time.source": "lseen",
+         "extra.tags": "tags.str"
+     },
+
+  The above configuration will put the value of the "url" key into "source.url", the value of the "lseen" key into "time.source" and so on.
+
+* `"default_url_protocol"`: For URLs you can give a default protocol which will be prepended to the data. Default: `"http://"`.
+* `"time_format"`: Optional. If `"timestamp"`, `"windows_nt"`, `"epoch_millis"`, `"from_format"`, `"from_format_midnight"`, `"utc_isoformat"` or `"fuzzy"`, the time will be converted first. With the default `null`, fuzzy time parsing will be used.
+* `"type"`: set the `classification.type` statically, optional. Default: "c2server".
+
 Key-Value Parser
 ^^^^^^^^^^^^^^^^
 

diff --git a/intelmq/bots/BOTS b/intelmq/bots/BOTS
@@ -547,6 +547,19 @@
                 "splitlines": false
             }
         },
+        "JSON Custom": {
+            "description": "JSON Custom Parser converts from a custom JSON-String into an Event",
+            "module": "intelmq.bots.parsers.json_custom.parser",
+            "parameters": {
+                "splitlines": false,
+                "multiple_msg_field": null,
+                "json_data_format": false,
+                "json_data_key": null,
+                "time_format": null,
+                "translate_fields": {},
+                "type": "c2server"
+            }
+        },
         "Key-Value": {
             "description": "Parse key=value strings.",
             "module": "intelmq.bots.parsers.key_value.parser",

diff --git a/intelmq/bots/parsers/json_custom/__init__.py b/intelmq/bots/parsers/json_custom/__init__.py
diff --git a/intelmq/bots/parsers/json_custom/parser.py b/intelmq/bots/parsers/json_custom/parser.py
@@ -0,0 +1,102 @@
+# -*- coding: utf-8 -*-
+
+from dateutil.parser import parse
+
+from intelmq.lib.bot import ParserBot
+from intelmq.lib.harmonization import DateTime
+from intelmq.lib.message import Message
+from intelmq.lib.utils import base64_decode
+from intelmq.lib.harmonization import DateTime
+
+
+class JSONCustomParserBot(ParserBot):
+
+    def init(self):
+        """
+        Load the bot parameters and validate ``time_format``.
+
+        Every parameter is read from ``self.parameters`` with a fallback, so
+        a configuration that omits optional keys still starts.
+
+        Raises:
+            InvalidArgument: if the conversion name in ``time_format`` (the
+                part before an optional ``|``) is not a key of
+                ``DateTime.TIME_CONVERSIONS``.
+        """
+        # Imported here because the module-level imports do not provide it;
+        # without it the error path below would fail with a NameError.
+        from intelmq.lib.exceptions import InvalidArgument
+
+        self.time_format = getattr(self.parameters, "time_format", None)
+        if self.time_format:
+            # "name|argument" is allowed, e.g. "from_format_midnight|%Y-%m-%d"
+            conversion_name = self.time_format.split('|')[0]
+            if conversion_name not in DateTime.TIME_CONVERSIONS:
+                # The documentation page is built from docs/user/bots.rst,
+                # hence the lower-case file name in the URL.
+                raise InvalidArgument('time_format', got=self.time_format,
+                                      expected=list(DateTime.TIME_CONVERSIONS.keys()),
+                                      docs='https://intelmq.readthedocs.io/en/latest/user/bots.html#json-custom-parser')
+
+        self.json_data_format = getattr(self.parameters, 'json_data_format', False)
+        self.json_data_key = getattr(self.parameters, 'json_data_key', '')
+        self.multiple_msg_field = getattr(self.parameters, 'multiple_msg_field', None)
+        self.translate_fields = getattr(self.parameters, 'translate_fields', {})
+        self.split_lines = getattr(self.parameters, 'splitlines', False)
+        self.default_url_protocol = getattr(self.parameters, 'default_url_protocol', 'http://')
+        # The parameter is documented as optional with default "c2server";
+        # a missing value must not abort start-up with an AttributeError.
+        self.classification_type = getattr(self.parameters, 'type', 'c2server')
+
+    def flatten_json(self, json_object):
+        """
+        Flatten nested dictionaries into a single dict with dotted keys.
+
+        ``{"a": {"b": 1}}`` becomes ``{"a.b": 1}``. Only dictionaries
+        (including dict subclasses) are descended into; every other value,
+        lists included, is stored unchanged as a leaf. Empty nested
+        dictionaries contribute no key. A non-dict *json_object* is stored
+        under the empty key ``""``.
+
+        Parameters:
+            json_object: the decoded JSON value; dictionary keys must be
+                strings.
+
+        Returns:
+            dict: flattened key -> leaf value.
+        """
+        flat = {}
+
+        def walk(node, path=()):
+            # isinstance instead of an exact type check, so that dict
+            # subclasses such as OrderedDict are flattened as well
+            if isinstance(node, dict):
+                for key, value in node.items():
+                    walk(value, path + (key,))
+            else:
+                flat['.'.join(path)] = node
+
+        walk(json_object)
+        return flat
+
+    def _lookup_data_key(self, payload):
+        """
+        Return the value stored under ``json_data_key`` in *payload*.
+
+        The key is tried literally first, so configurations that already
+        work keep working. If that lookup fails and the key contains a dot,
+        it is read as a dot-separated path: ``"data.ipdata"`` resolves to
+        ``payload["data"]["ipdata"]``.
+
+        Raises:
+            KeyError: if neither the literal key nor the path exists.
+        """
+        key = self.json_data_key
+        try:
+            return payload[key]
+        except KeyError:
+            if not isinstance(key, str) or '.' not in key:
+                raise
+
+        node = payload
+        for part in key.split('.'):
+            node = node[part]
+        return node
+
+    def process(self):
+        """
+        Turn one received report into events and send them.
+
+        The decoded ``raw`` field of the report is split into records:
+        the list under ``json_data_key`` if ``json_data_format`` is set,
+        the individual lines if ``splitlines`` is set, otherwise the whole
+        text as a single record. Each record is flattened and the fields
+        named in ``translate_fields`` are copied into an event. If the value
+        of ``multiple_msg_field`` is a list, one event per list item is sent.
+        The report is acknowledged after all records have been handled.
+        """
+        report = self.receive_message()
+        raw_report = base64_decode(report["raw"])
+
+        if self.json_data_format:
+            lines = self._lookup_data_key(Message.unserialize(raw_report))
+        elif self.split_lines:
+            lines = raw_report.splitlines()
+        else:
+            lines = [raw_report]
+
+        for line in lines:
+            if not line:
+                continue
+
+            msg = line if self.json_data_format else Message.unserialize(line)
+            flatten_msg = self.flatten_json(msg)
+            event_msg = {}
+
+            for key, json_key in self.translate_fields.items():
+                data = flatten_msg.get(json_key)
+
+                if key in ("time.source", "time.destination"):
+                    if data is None:
+                        # the record has no such field; nothing to convert
+                        continue
+                    try:
+                        # numeric strings are handed on as integers
+                        data = int(data)
+                    except (TypeError, ValueError):
+                        pass
+                    # An unset time_format is documented to mean fuzzy parsing.
+                    # NOTE(review): assumes DateTime.convert accepts
+                    # format='fuzzy' -- confirm against intelmq.lib.harmonization
+                    data = DateTime.convert(data, format=self.time_format or 'fuzzy')
+
+                elif key.endswith('.url'):
+                    if not data:
+                        continue
+                    if '://' not in data:
+                        data = self.default_url_protocol + data
+
+                event_msg[key] = data
+
+            # NOTE(review): an empty list in multiple_msg_field yields no
+            # event at all for this record -- confirm that this is intended
+            if isinstance(event_msg.get(self.multiple_msg_field), list):
+                multiple_msgs = []
+                for value in event_msg[self.multiple_msg_field]:
+                    new_msg = event_msg.copy()
+                    new_msg[self.multiple_msg_field] = value
+                    multiple_msgs.append(new_msg)
+            else:
+                multiple_msgs = [event_msg]
+
+            # The raw value is the same for every event of this record, so it
+            # is computed once; this happens after flattening on purpose.
+            raw = Message.serialize(line) if self.json_data_format else line
+
+            for fields in multiple_msgs:
+                event = self.new_event(report)
+                event.update(fields)
+
+                if self.classification_type and "classification.type" not in event:
+                    event.add('classification.type', self.classification_type)
+                event['raw'] = raw
+
+                self.send_message(event)
+
+        self.acknowledge_message()
+
+
+BOT = JSONCustomParserBot
diff --git a/intelmq/tests/bots/parsers/json_custom/__init__.py b/intelmq/tests/bots/parsers/json_custom/__init__.py
diff --git a/intelmq/tests/bots/parsers/json_custom/json_key_data.json b/intelmq/tests/bots/parsers/json_custom/json_key_data.json
@@ -0,0 +1,89 @@
+{
+    "complete": false,
+    "count": 9632240,
+    "data": [
+        {
+            "ip": "179.124.36.196",
+            "seen": true,
+            "classification": "malicious",
+            "spoofable": false,
+            "first_seen": "2020-01-13",
+            "last_seen": "2020-12-14",
+            "actor": "unknown",
+            "tags": [
+                "SSH Scanner",
+                "SSH Worm"
+            ],
+            "cve": [],
+            "metadata": {
+                "country": "Brazil",
+                "country_code": "BR",
+                "city": "São Paulo",
+                "organization": "EQUINIX BRASIL",
+                "region": "São Paulo",
+                "rdns": "196.36.124.179.static.sp2.alog.com.br",
+                "asn": "AS16397",
+                "tor": false,
+                "os": "Linux 3.1-3.10",
+                "category": "isp",
+                "vpn": false,
+                "vpn_service": ""
+            },
+            "raw_data": {
+                "scan": [
+                    {
+                        "port": 22,
+                        "protocol": "TCP"
+                    },
+                    {
+                        "port": 2222,
+                        "protocol": "TCP"
+                    }
+                ],
+                "web": {},
+                "ja3": []
+            }
+        },
+        {
+            "ip": "189.86.227.150",
+            "seen": true,
+            "classification": "malicious",
+            "spoofable": false,
+            "first_seen": "2019-01-17",
+            "last_seen": "2020-12-14",
+            "actor": "unknown",
+            "tags": [
+                "Eternalblue",
+                "SMB Scanner"
+            ],
+            "cve": [
+                "CVE-2017-0144"
+            ],
+            "metadata": {
+                "country": "Brazil",
+                "country_code": "BR",
+                "city": "Sorocaba",
+                "organization": "CLARO S.A.",
+                "region": "São Paulo",
+                "rdns": "bkbrasil-g2-0-0-15122-iacc02.gna.embratel.net.br",
+                "asn": "AS4230",
+                "tor": false,
+                "os": "Windows 7/8",
+                "category": "isp",
+                "vpn": false,
+                "vpn_service": ""
+            },
+            "raw_data": {
+                "scan": [
+                    {
+                        "port": 445,
+                        "protocol": "TCP"
+                    }
+                ],
+                "web": {},
+                "ja3": []
+            }
+        }
+    ],
+    "message": "ok"
+}
diff --git a/intelmq/tests/bots/parsers/json_custom/multiple_msg.json b/intelmq/tests/bots/parsers/json_custom/multiple_msg.json
@@ -0,0 +1 @@
+{"domain": "kreditohneschufa48.de", "fseen": 1576368000, "lseen": 1607731200, "collect": 1607817600, "tags": {"str": ["spam"], "codes": [2]}, "resolved": {"ip": {"a": ["23.60.91.225", "23.200.237.225"], "alias": [], "cname": []}, "whois": {"created": "1970-01-01 00:00:00", "updated": "1970-01-01 00:00:00", "expires": "1970-01-01 00:00:00", "age": 0, "registrar": "unknown", "registrant": "unknown", "havedata": "false"}}, "score": {"total": 3, "src": 60.2, "tags": 0.75, "frequency": 0.07}, "fp": {"alarm": "false", "descr": ""}, "threat": [], "id": "d267c60f-5709-3698-9523-f727f42ab5c7", "title": "RST Threat feed. IOC: kreditohneschufa48.de", "description": "IOC with tags: spam"}
diff --git a/intelmq/tests/bots/parsers/json_custom/sample.json b/intelmq/tests/bots/parsers/json_custom/sample.json
@@ -0,0 +1 @@
+{"url": "114.234.166.255:39436/mozi.a", "fseen": 1598918400, "lseen": 1601942400, "collect": 1602028800, "tags": {"str": ["malware"], "codes": [10]}, "score": {"total": 10, "src": 73.06, "tags": 0.89, "frequency": 0.58}, "resolved": {"status": 503}, "fp": {"alarm": "true", "descr": "Resource unavailable"}, "threat": [], "id": "987f5038-298f-37eb-a1d5-a17105f6b4b5", "title": "RST Threat feed. IOC: 114.234.166.255:39436/mozi.a", "description": "IOC with tags: malware"}
diff --git a/intelmq/tests/bots/parsers/json_custom/test_json_key_data.py b/intelmq/tests/bots/parsers/json_custom/test_json_key_data.py
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+import base64
+import os
+import unittest
+
+import intelmq.lib.test as test
+from intelmq.bots.parsers.json_custom.parser import JSONCustomParserBot
+
+with open(os.path.join(os.path.dirname(__file__), 'json_key_data.json'), 'rb') as fh:
+    RAW = base64.b64encode(fh.read()).decode()
+
+REPORT = {"feed.name": "Test Feed",
+          "raw": RAW,
+          "__type": "Report",
+          }
+EVENT = {'__type': 'Event',
+         'classification.type': 'malware',
+         'extra.tags': ['SSH Scanner', 'SSH Worm'],
+         'feed.name': 'Test Feed',
+         'raw': 'eyJpcCI6ICIxNzkuMTI0LjM2LjE5NiIsICJzZWVuIjogdHJ1ZSwgImNsYXNzaWZpY2F0aW9'
+                'uIjogIm1hbGljaW91cyIsICJzcG9vZmFibGUiOiBmYWxzZSwgImZpcnN0X3NlZW4iOiAiMj'
+                'AyMC0wMS0xMyIsICJsYXN0X3NlZW4iOiAiMjAyMC0xMi0xNCIsICJhY3RvciI6ICJ1bmtub'
+                '3duIiwgInRhZ3MiOiBbIlNTSCBTY2FubmVyIiwgIlNTSCBXb3JtIl0sICJjdmUiOiBbXSwg'
+                'Im1ldGFkYXRhIjogeyJjb3VudHJ5IjogIkJyYXppbCIsICJjb3VudHJ5X2NvZGUiOiAiQlI'
+                'iLCAiY2l0eSI6ICJTXHUwMGUzbyBQYXVsbyIsICJvcmdhbml6YXRpb24iOiAiRVFVSU5JWC'
+                'BCUkFTSUwiLCAicmVnaW9uIjogIlNcdTAwZTNvIFBhdWxvIiwgInJkbnMiOiAiMTk2LjM2L'
+                'jEyNC4xNzkuc3RhdGljLnNwMi5hbG9nLmNvbS5iciIsICJhc24iOiAiQVMxNjM5NyIsICJ0'
+                'b3IiOiBmYWxzZSwgIm9zIjogIkxpbnV4IDMuMS0zLjEwIiwgImNhdGVnb3J5IjogImlzcCI'
+                'sICJ2cG4iOiBmYWxzZSwgInZwbl9zZXJ2aWNlIjogIiJ9LCAicmF3X2RhdGEiOiB7InNjYW'
+                '4iOiBbeyJwb3J0IjogMjIsICJwcm90b2NvbCI6ICJUQ1AifSwgeyJwb3J0IjogMjIyMiwgI'
+                'nByb3RvY29sIjogIlRDUCJ9XSwgIndlYiI6IHt9LCAiamEzIjogW119LCAiX190eXBlIjog'
+                'ImRpY3QifQ==',
+         'time.source': '2020-12-14T00:00:00+00:00',
+         'source.ip': '179.124.36.196'
+         }
+
+
+class TestJSONCustomParserBot(test.BotTestCase, unittest.TestCase):
+    """
+    Runs the JSONCustomParserBot on a report whose records are stored in a
+    list under a key of the JSON document.
+    """
+
+    @classmethod
+    def set_bot(cls):
+        cls.bot_reference = JSONCustomParserBot
+
+    def test_sample(self):
+        """ The first event sent by the bot must equal EVENT. """
+        field_map = {
+            "source.ip": "ip",
+            "time.source": "last_seen",
+            "extra.tags": "tags",
+        }
+        self.input_message = REPORT
+        self.sysconfig = dict(
+            json_data_format=True,
+            json_data_key="data",
+            type="malware",
+            time_format="from_format_midnight|%Y-%m-%d",
+            translate_fields=field_map,
+        )
+        self.run_bot()
+        self.assertMessageEqual(0, EVENT)
+
+
+if __name__ == '__main__':  # pragma: no cover
+    unittest.main()
diff --git a/intelmq/tests/bots/parsers/json_custom/test_multiple_msg.py b/intelmq/tests/bots/parsers/json_custom/test_multiple_msg.py
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+import base64
+import os
+import unittest
+
+import intelmq.lib.test as test
+from intelmq.bots.parsers.json_custom.parser import JSONCustomParserBot
+
+with open(os.path.join(os.path.dirname(__file__), 'multiple_msg.json'), 'rb') as fh:
+    RAW1 = base64.b64encode(fh.read()).decode()
+
+MULTILINE_REPORT = {"feed.name": "RSTThreats Domain Feed",
+                    "raw": RAW1,
+                    "__type": "Report",
+                    }
+
+MULTIPLE_EVENT1 = {'__type': 'Event',
+                   'classification.type': 'malware',
+                   'extra.tags': ['spam'],
+                   'extra.threat_info': [],
+                   'feed.name': 'RSTThreats Domain Feed',
+                   'raw': 'eyJkb21haW4iOiAia3JlZGl0b2huZXNjaHVmYTQ4LmRlIiwgImZzZWVuIjogMTU3NjM2O'
+                          'DAwMCwgImxzZWVuIjogMTYwNzczMTIwMCwgImNvbGxlY3QiOiAxNjA3ODE3NjAwLCAidG'
+                          'FncyI6IHsic3RyIjogWyJzcGFtIl0sICJjb2RlcyI6IFsyXX0sICJyZXNvbHZlZCI6IHs'
+                          'iaXAiOiB7ImEiOiBbIjIzLjYwLjkxLjIyNSIsICIyMy4yMDAuMjM3LjIyNSJdLCAiYWxp'
+                          'YXMiOiBbXSwgImNuYW1lIjogW119LCAid2hvaXMiOiB7ImNyZWF0ZWQiOiAiMTk3MC0wM'
+                          'S0wMSAwMDowMDowMCIsICJ1cGRhdGVkIjogIjE5NzAtMDEtMDEgMDA6MDA6MDAiLCAiZX'
+                          'hwaXJlcyI6ICIxOTcwLTAxLTAxIDAwOjAwOjAwIiwgImFnZSI6IDAsICJyZWdpc3RyYXI'
+                          'iOiAidW5rbm93biIsICJyZWdpc3RyYW50IjogInVua25vd24iLCAiaGF2ZWRhdGEiOiAi'
+                          'ZmFsc2UifX0sICJzY29yZSI6IHsidG90YWwiOiAzLCAic3JjIjogNjAuMiwgInRhZ3MiO'
+                          'iAwLjc1LCAiZnJlcXVlbmN5IjogMC4wN30sICJmcCI6IHsiYWxhcm0iOiAiZmFsc2UiLC'
+                          'AiZGVzY3IiOiAiIn0sICJ0aHJlYXQiOiBbXSwgImlkIjogImQyNjdjNjBmLTU3MDktMzY'
+                          '5OC05NTIzLWY3MjdmNDJhYjVjNyIsICJ0aXRsZSI6ICJSU1QgVGhyZWF0IGZlZWQuIElP'
+                          'Qzoga3JlZGl0b2huZXNjaHVmYTQ4LmRlIiwgImRlc2NyaXB0aW9uIjogIklPQyB3aXRoI'
+                          'HRhZ3M6IHNwYW0ifQ==',
+                   'source.fqdn': 'kreditohneschufa48.de',
+                   'source.ip': '23.60.91.225',
+                   'time.source': '2020-12-12T00:00:00+00:00'
+                   }
+
+MULTIPLE_EVENT2 = MULTIPLE_EVENT1.copy()
+MULTIPLE_EVENT2["source.ip"] = "23.200.237.225"
+
+
+class TestJSONCustomParserBot(test.BotTestCase, unittest.TestCase):
+    """
+    Runs the JSONCustomParserBot with ``multiple_msg_field`` set, so that a
+    list value is split into one event per item.
+    """
+
+    @classmethod
+    def set_bot(cls):
+        cls.bot_reference = JSONCustomParserBot
+
+    def test_multiple_msg(self):
+        """ Both events sent by the bot must equal the expected ones. """
+        field_map = {
+            "source.fqdn": "domain",
+            "time.source": "lseen",
+            "extra.tags": "tags.str",
+            "extra.threat_info": "threat",
+            "source.ip": "resolved.ip.a",
+        }
+        self.input_message = MULTILINE_REPORT
+        self.sysconfig = dict(
+            splitlines=True,
+            type="malware",
+            time_format="epoch_millis",
+            multiple_msg_field="source.ip",
+            translate_fields=field_map,
+        )
+        self.run_bot()
+        for position, expected in enumerate((MULTIPLE_EVENT1, MULTIPLE_EVENT2)):
+            self.assertMessageEqual(position, expected)
+
+
+if __name__ == '__main__':  # pragma: no cover
+    unittest.main()
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"domain": "kreditohneschufa48.de", "fseen": 1576368000, "lseen": 1607731200, "collect": 1607817600, "tags": {"str": ["spam"], "codes": [2]}, "resolved": {"ip": {"a": ["23.60.91.225", "23.200.237.225"], "alias": [], "cname": []}, "whois": {"created": "1970-01-01 00:00:00", "updated": "1970-01-01 00:00:00", "expires": "1970-01-01 00:00:00", "age": 0, "registrar": "unknown", "registrant": "unknown", "havedata": "false"}}, "score": {"total": 3, "src": 60.2, "tags": 0.75, "frequency": 0.07}, "fp": {"alarm": "false", "descr": ""}, "threat": [], "id": "d267c60f-5709-3698-9523-f727f42ab5c7", "title": "RST Threat feed. IOC: kreditohneschufa48.de", "description": "IOC with tags: spam"}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"url": "114.234.166.255:39436/mozi.a", "fseen": 1598918400, "lseen": 1601942400, "collect": 1602028800, "tags": {"str": ["malware"], "codes": [10]}, "score": {"total": 10, "src": 73.06, "tags": 0.89, "frequency": 0.58}, "resolved": {"status": 503}, "fp": {"alarm": "true", "descr": "Resource unavailable"}, "threat": [], "id": "987f5038-298f-37eb-a1d5-a17105f6b4b5", "title": "RST Threat feed. IOC: 114.234.166.255:39436/mozi.a", "description": "IOC with tags: malware"}