Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file removed .DS_Store
Binary file not shown.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.DS_Store
config.yml
*.pyc
__pycache__
Expand Down
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,18 @@ To get every user and service provider that has used this IdP for authentication
### Previous functionality of `logscan.py`

The subcommand `loop`, which scanned webserver logs for the looping behavior we saw in late 2020, was removed in commit #bf21dda, which left subcommand `sp` as the only operation. It was simplified to remove the IdP version option in commit #0a61bde, and then removed as a subcommand in commit #6beab69. A final round of code cleanup in commit #b8250c8 renamed the script from `logcheck.py` and removed a few more remnants of the old code.



## `loop-checker.py`

This script scans one or more `access.log` files and reports:
- Which IP/user agent pairs are experiencing login loops
- Which IPs appear with multiple user agent strings


### Filenames

For now, the default file is the sample log included in this repository (`access.2021-02-01.log`). Ideally, it would default to the current (live) log file, as it is named on the idpv4 servers. Use `-f` to specify a different filename.

Multiple filenames are allowed (separated by spaces), wildcards are allowed, and filenames that end in `.gz` can be processed without unzipping them.
64,090 changes: 64,090 additions & 0 deletions access.2021-02-01.log

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions check-config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
ServicesConfig
)
from pathlib import Path
import socket
import subprocess
import yaml

Expand All @@ -25,8 +26,14 @@ def get_config(args):
config['properties']['idp.home'] = str(config['shibboleth-root'])
if 'metadata-require' not in config:
config['metadata-require'] = ['%{idp.home}/metadata/idp-metadata.xml']
if 'metadata-ignore' not in config:
config['metadata-ignore'] = []
if 'xmllint' not in config:
config['xmllint'] = '/usr/bin/xmllint'
if 'hostname' not in config:
config['hostname'] = socket.gethostbyaddr(socket.gethostname())[0]
if 'test-sp' not in config:
config['test-sp'] = 'test'
return config


Expand Down
53 changes: 53 additions & 0 deletions loop-checker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/usr/bin/env python3

from argparse import ArgumentParser
from parsers import WebserverLog

def scan(args):
    """Load every requested log file into one WebserverLog, then scan it.

    Only ``args.filename`` is read; it is iterated and each item is passed
    to ``WebserverLog.load``.
    """
    # NOTE(review): indentation was lost in the text this was recovered from;
    # a single scan after all files are loaded is assumed -- confirm.
    combined_log = WebserverLog()
    for path in args.filename:
        combined_log.load(path)
    scanLog(combined_log)

def scanLog(log):
    """Report multi-device IP addresses and login loops found in *log*.

    Calls ``log.command_loops()`` first, then walks every sequence stored in
    ``log.sequences``. The first event of each sequence supplies the IP
    address (``ip_addr``) and user agent (``browser``) used to identify the
    device.

    Two kinds of lines are printed to stdout:

    * ``Multiple devices detected from ...`` each time an IP address shows up
      with a user agent it has not been paired with before (from the second
      distinct user agent onward).
    * ``Loop detected from ...`` for every entry returned by the sequence's
      ``detect_loops()``.

    Returns ``None``.
    """
    log.command_loops()

    # IP address -> list of "ip:user_agent" identifiers, in first-seen order.
    devices_by_ip = {}

    # NOTE(review): block nesting was reconstructed from flattened text; the
    # "multiple devices" report is assumed to fire only when a new device is
    # added for an already-known IP address -- confirm.
    for sequences in log.sequences.values():
        for sequence in sequences:
            first_event = sequence.events[0]
            ip_addr = first_event.ip_addr
            user_agent = first_event.browser  # user agent identifies the device

            # IP address plus user agent uniquely identifies a device.
            device_identifier = f'{ip_addr}:{user_agent}'

            known_devices = devices_by_ip.setdefault(ip_addr, [])
            if device_identifier not in known_devices:
                known_devices.append(device_identifier)
                if len(known_devices) > 1:
                    print(f'Multiple devices detected from {ip_addr}: {", ".join(known_devices)}\n')

            # NOTE(review): the key is printed verbatim. The revised
            # detect_loops appears to key its result by IP address rather
            # than by timecode -- confirm the intended label.
            for loop_key, loop_events in sequence.detect_loops().items():
                print(
                    f'Loop detected from {ip_addr:15s} {user_agent:40s} {loop_key} - {len(loop_events):4d} - {loop_events[0]}\n')

if __name__ == "__main__":
    # Command-line entry point: collect the log filename(s), then scan them.
    parser = ArgumentParser(epilog='')
    parser.add_argument(
        '-f', '--filename',
        type=str,
        nargs='*',
        default=['access.2021-02-01.log'],
        help='Log filename(s) to process, accepts wildcards',
    )
    scan(parser.parse_args())
2 changes: 1 addition & 1 deletion parsers/_configfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def make_path(self, text):
def translate_config(self):
path_fields = ['metadata-require', 'metadata-ignore']
for field in path_fields:
if self.config[field]:
if field in self.config:
paths = self.config[field]
if isinstance(paths, str):
paths = [paths]
Expand Down
40 changes: 30 additions & 10 deletions parsers/webserver.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from urllib.parse import parse_qs
from ._logfile import _LogEvent, _LogSequence, _LogFile

# Could possibly use logger from loguru to replace regex architecture

class WebserverEvent(_LogEvent):
# Inherited methods:
Expand All @@ -23,11 +24,11 @@ class WebserverSequence(_LogSequence):
# last_time(self)
# limit_time(self)

def detect_loops(self, constraints={}):
def detect_loops(self, constraints={}, time_threshold_seconds=5):
# TODO: allow regex patterns as constraints
previous = None
timecode = None
previous_events = {} # Dictionary to store previous events for each IP address
loops = {}

for event in self.events:
# Filter based on constraints
eligible = True
Expand All @@ -36,13 +37,31 @@ def detect_loops(self, constraints={}):
eligible = False
if not eligible:
continue
if str(event) == str(previous):
if timecode not in loops:
loops[timecode] = [previous]
loops[timecode].append(event)
else:
previous = event
timecode = event.time.strftime("%Y-%m-%d %H:%M:%S")

# Exclude specific patterns like "Stale request" errors
if "Stale request" in event.request: # Will have to be adapted to correct Stale request format
continue

# Define an identifier for the event: IP address
identifier = event.ip_addr

# Check if this IP address has seen a previous event
if identifier in previous_events:
prev_event = previous_events[identifier]

# Calculate the time difference between events
time_difference = (event.time - prev_event.time).total_seconds()

# Check if the time difference is within the specified time threshold
if time_difference <= time_threshold_seconds:
# Events are within the time threshold, consider them part of the same loop
if identifier not in loops:
loops[identifier] = []
loops[identifier].append(prev_event)
loops[identifier].append(event)

previous_events[identifier] = event # Update the previous event for this identifier

return loops


Expand Down Expand Up @@ -122,6 +141,7 @@ def command_loops(self):
# 'bytes': 3972,
}
self.find_sequences(index_attr='id')

for id in self.sequences:
for sequence in self.sequences[id]:
ip_addr = sequence.events[0].ip_addr
Expand Down