Skip to content

Commit 0076880

Browse files
committed
json parser: add tests, optimize multiple events mode
Add test cases for multiple events mode; optimize runtime for multiple events mode; add documentation; add classification.type = undetermined if input data does not contain the field; fix a bug in intelmq.lib.message.Message.from_dict: do not modify the dict parameter by adding the `__type` field.
1 parent f8b4aa0 commit 0076880

File tree

7 files changed

+180
-21
lines changed

7 files changed

+180
-21
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@ Please refer to the [NEWS](NEWS.md) for a list of changes which have an affect o
1919
- Drop support for Python 3.8 (fixes #2616, PR#2617 by Sebastian Wagner).
2020
- `intelmq.lib.splitreports`: Handle bot parameter `chunk_size` values empty string, due to missing parameter typing checks (PR#2604 by Sebastian Wagner).
2121
- `intelmq.lib.mixins.sql` Add Support for MySQL (PR#2625 by Karl-Johan Karlsson).
22+
- Python 3.8 or newer is required (PR#2541 by Sebastian Wagner).
23+
- `intelmq.lib.utils.list_all_bots`/`intelmqctl check`: Fix check for bot executable in $PATH by using the bot name instead of the import path (fixes #2559, PR#2564 by Sebastian Wagner).
24+
- `intelmq.lib.message.MessageFactory.from_dict`: Do not modify the dict parameter by adding the `__type` field (PR#2545 by Sebastian Wagner).
2225

2326
### Development
2427

@@ -29,6 +32,9 @@ Please refer to the [NEWS](NEWS.md) for a list of changes which have an affect o
2932

3033
#### Parsers
3134
- `intelmq.bots.parsers.cymru.parser_cap_program`: Add mapping for TOR and ipv6-icmp protocol (PR#2621 by Mikk Margus Möll).
35+
- `intelmq.bots.parsers.json.parser`:
36+
- Support data containing lists of JSON Events (PR#2545 by Tim de Boer).
37+
- Add default `classification.type` with value `undetermined` if input data has no classification itself (PR#2545 by Sebastian Wagner).
3238

3339
#### Experts
3440
- `intelmq.bots.experts.asn_lookup.expert`:

docs/user/bots.md

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1923,12 +1923,69 @@ also <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>). Defaults to `htm
19231923

19241924
---
19251925

1926-
### JSON (TODO) <div id="intelmq.bots.parsers.json.parser" />
1926+
### JSON <div id="intelmq.bots.parsers.json.parser" />
19271927

1928-
TODO
1928+
Parses JSON events that are already in IntelMQ format.
1929+
If the input data does not contain the field `classification.type`, it is set to `undetermined`.
1930+
1931+
Supports multiple different modes:
1932+
1933+
#### Input data is one event
1934+
Example:
1935+
```json
1936+
{ INTELMQ data... }
1937+
```
1938+
or:
1939+
```json
1940+
{
1941+
INTELMQ data...
1942+
}
1943+
```
1944+
1945+
Configuration:
1946+
* `splitlines`: False
1947+
* `multiple_events`: False
1948+
1949+
#### Input data is in JSON stream format
1950+
Example:
1951+
```json
1952+
{ INTELMQ data... }
1953+
{ INTELMQ data... }
1954+
{ INTELMQ data... }
1955+
```
1956+
1957+
Configuration:
1958+
* `splitlines`: True
1959+
* `multiple_events`: False
1960+
1961+
#### Input data is a list of events
1962+
Example:
1963+
```json
1964+
[
1965+
{ INTELMQ data... },
1966+
{ INTELMQ data... },
1967+
...
1968+
]
1969+
```
1970+
1971+
Configuration:
1972+
* `splitlines`: False
1973+
* `multiple_events`: True
1974+
1975+
#### Configuration
19291976

19301977
**Module:** `intelmq.bots.parsers.json.parser`
19311978

1979+
**Parameters:**
1980+
1981+
**`splitlines`**
1982+
1983+
(optional, boolean) When the input file contains one JSON dictionary per line, set this to `true`. Defaults to `false`.
1984+
1985+
**`multiple_events`**
1986+
1987+
(optional, boolean) When the input file contains a JSON list of dictionaries, set this to `true`. Defaults to `false`.
1988+
19321989
---
19331990

19341991
### Key=Value Parser <div id="intelmq.bots.parsers.key_value.parser" />

intelmq/bots/parsers/json/parser.py

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,44 @@
1-
# SPDX-FileCopyrightText: 2016 by Bundesamt für Sicherheit in der Informationstechnik
1+
# SPDX-FileCopyrightText: 2016 by Bundesamt für Sicherheit in der Informationstechnik, 2016-2021 nic.at GmbH, 2024 Tim de Boer, 2025 Institute for Common Good Technology
22
#
33
# SPDX-License-Identifier: AGPL-3.0-or-later
44
"""
55
JSON Parser Bot
66
Retrieves a base64 encoded JSON-String from raw and converts it into an
77
event.
8-
9-
Copyright (C) 2016 by Bundesamt für Sicherheit in der Informationstechnik
10-
Software engineering by Intevation GmbH
118
"""
129
from intelmq.lib.bot import ParserBot
1310
from intelmq.lib.message import MessageFactory
1411
from intelmq.lib.utils import base64_decode
15-
import json
12+
from json import loads as json_loads, dumps as json_dumps
1613

1714

1815
class JSONParserBot(ParserBot):
1916
"""Parse IntelMQ-JSON data"""
20-
splitlines = False
21-
multiple_events = False
17+
splitlines: bool = False
18+
multiple_events: bool = False
2219

2320
def process(self):
2421
report = self.receive_message()
2522
if self.multiple_events:
26-
lines = [json.dumps(event) for event in json.loads(base64_decode(report['raw']))]
23+
lines = json_loads(base64_decode(report["raw"]))
2724
elif self.splitlines:
28-
lines = base64_decode(report['raw']).splitlines()
25+
lines = base64_decode(report["raw"]).splitlines()
2926
else:
30-
lines = [base64_decode(report['raw'])]
27+
lines = [base64_decode(report["raw"])]
3128

3229
for line in lines:
33-
new_event = MessageFactory.unserialize(line,
34-
harmonization=self.harmonization,
35-
default_type='Event')
3630
event = self.new_event(report)
37-
event.update(new_event)
38-
if 'raw' not in event:
39-
event['raw'] = line
31+
if self.multiple_events:
32+
event.update(MessageFactory.from_dict(line,
33+
harmonization=self.harmonization,
34+
default_type="Event"))
35+
event["raw"] = json_dumps(line, sort_keys=True)
36+
else:
37+
event.update(MessageFactory.unserialize(line,
38+
harmonization=self.harmonization,
39+
default_type="Event"))
40+
event.add('raw', line, overwrite=False)
41+
event.add("classification.type", "undetermined", overwrite=False) # set to undetermined if input has no classification
4042
self.send_message(event)
4143
self.acknowledge_message()
4244

intelmq/lib/message.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,17 +49,17 @@ def from_dict(message: dict, harmonization=None,
4949
MessageFactory.unserialize
5050
MessageFactory.serialize
5151
"""
52-
if default_type and "__type" not in message:
53-
message["__type"] = default_type
5452
try:
55-
class_reference = getattr(intelmq.lib.message, message["__type"])
53+
class_reference = getattr(intelmq.lib.message, message.get("__type", default_type))
5654
except AttributeError:
5755
raise exceptions.InvalidArgument('__type',
5856
got=message["__type"],
5957
expected=VALID_MESSSAGE_TYPES,
6058
docs=HARMONIZATION_CONF_FILE)
6159
# don't modify the parameter
6260
message_copy = message.copy()
61+
if default_type and "__type" not in message_copy:
62+
message_copy["__type"] = default_type
6363
del message_copy["__type"]
6464
return class_reference(message_copy, auto=True, harmonization=harmonization)
6565

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
[
2+
{
3+
"extra.dataset_collections": "0",
4+
"extra.dataset_files": "1",
5+
"extra.dataset_infected": "false",
6+
"extra.dataset_ransom": "null",
7+
"extra.dataset_rows": "0",
8+
"extra.dataset_size": "301",
9+
"protocol.application": "https",
10+
"protocol.transport": "tcp",
11+
"source.asn": 12345689,
12+
"source.fqdn": "fqdn-example-1.tld",
13+
"source.geolocation.cc": "NL",
14+
"source.geolocation.city": "Enschede",
15+
"source.geolocation.latitude": 52.0000000000000,
16+
"source.geolocation.longitude": 6.0000000000000,
17+
"source.geolocation.region": "Overijssel",
18+
"source.ip": "127.1.2.1",
19+
"source.network": "127.1.0.0/16",
20+
"source.port": 80,
21+
"time.source": "2024-12-16T02:08:06+00:00"
22+
},
23+
{
24+
"extra.dataset_collections": "0",
25+
"extra.dataset_files": "1",
26+
"extra.dataset_infected": "false",
27+
"extra.dataset_ransom": "null",
28+
"extra.dataset_rows": "0",
29+
"extra.dataset_size": "615",
30+
"extra.os_name": "Ubuntu",
31+
"extra.software": "Apache",
32+
"extra.tag": "rescan",
33+
"extra.version": "2.4.58",
34+
"protocol.application": "https",
35+
"protocol.transport": "tcp",
36+
"source.asn": 12345689,
37+
"source.fqdn": "fqdn-example-2.tld",
38+
"source.geolocation.cc": "NL",
39+
"source.geolocation.city": "Eindhoven",
40+
"source.geolocation.latitude": 51.0000000000000,
41+
"source.geolocation.longitude": 5.0000000000000,
42+
"source.geolocation.region": "North Brabant",
43+
"source.ip": "127.1.2.2",
44+
"source.network": "127.1.0.0/16",
45+
"source.port": 443,
46+
"time.source": "2024-12-16T02:08:12+00:00"
47+
},
48+
{
49+
"extra.dataset_collections": "0",
50+
"extra.dataset_files": "1",
51+
"extra.dataset_infected": "false",
52+
"extra.dataset_ransom": "null",
53+
"extra.dataset_rows": "0",
54+
"extra.dataset_size": "421",
55+
"protocol.application": "http",
56+
"protocol.transport": "tcp",
57+
"source.asn": 12345689,
58+
"source.geolocation.cc": "NL",
59+
"source.geolocation.city": "Enschede",
60+
"source.geolocation.latitude": 52.0000000000000,
61+
"source.geolocation.longitude": 6.0000000000000,
62+
"source.geolocation.region": "Overijssel",
63+
"source.ip": "127.1.2.3",
64+
"source.network": "127.1.0.0/16",
65+
"source.port": 9000,
66+
"time.source": "2024-12-15T21:09:49+00:00"
67+
}
68+
]
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
SPDX-FileCopyrightText: 2024 Tim de Boer
2+
SPDX-License-Identifier: AGPL-3.0-or-later

intelmq/tests/bots/parsers/json/test_parser.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import base64
77
import os
88
import unittest
9+
from json import loads as json_loads, dumps as json_dumps
910

1011
import intelmq.lib.test as test
1112
from intelmq.bots.parsers.json.parser import JSONParserBot
@@ -51,6 +52,21 @@
5152
NO_DEFAULT_EVENT = MULTILINE_EVENTS[1].copy()
5253
NO_DEFAULT_EVENT['raw'] = base64.b64encode(b'{"source.ip": "127.0.0.2", "classification.type": "c2-server"}\n').decode()
5354

55+
with open(os.path.join(os.path.dirname(__file__), 'ncscnl.json'), 'rb') as fh:
56+
NCSCNL_FILE = fh.read()
57+
NCSCNL_RAW = base64.b64encode(NCSCNL_FILE).decode()
58+
NCSC_EVENTS = json_loads(NCSCNL_FILE)
59+
for i, event in enumerate(NCSC_EVENTS):
60+
NCSC_EVENTS[i]['raw'] = base64.b64encode(json_dumps(event, sort_keys=True).encode()).decode()
61+
NCSC_EVENTS[i]['classification.type'] = 'undetermined'
62+
NCSC_EVENTS[i]['feed.name'] = 'NCSC.NL'
63+
NCSC_EVENTS[i]['__type'] = 'Event'
64+
65+
NCSCNL_REPORT = {"feed.name": "NCSC.NL",
66+
"raw": NCSCNL_RAW,
67+
"__type": "Report",
68+
}
69+
5470

5571
class TestJSONParserBot(test.BotTestCase, unittest.TestCase):
5672
"""
@@ -81,6 +97,14 @@ def test_default_event(self):
8197
self.run_bot()
8298
self.assertMessageEqual(0, NO_DEFAULT_EVENT)
8399

100+
def test_ncscnl(self):
101+
""" A file containing a list of events (not per line) """
102+
self.input_message = NCSCNL_REPORT
103+
self.run_bot(parameters={'multiple_events': True})
104+
self.assertMessageEqual(0, NCSC_EVENTS[0])
105+
self.assertMessageEqual(1, NCSC_EVENTS[1])
106+
self.assertMessageEqual(2, NCSC_EVENTS[2])
107+
84108

85109
if __name__ == '__main__': # pragma: no cover
86110
unittest.main()

0 commit comments

Comments
 (0)