From 315cdaf2ad9c8f0da6a7ac40ba4f52cdeca67446 Mon Sep 17 00:00:00 2001 From: Jordan Wages Date: Sat, 19 Jul 2025 01:37:33 -0500 Subject: [PATCH 01/39] Fix per-domain chart paths --- templates/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/index.html b/templates/index.html index 98e76ac..91482f7 100644 --- a/templates/index.html +++ b/templates/index.html @@ -166,7 +166,7 @@ container.innerHTML = '
<p>Select a domain</p>
'; return; } - path = 'domains/' + currentDomain + '/' + currentInterval; + path = 'domains/' + encodeURIComponent(currentDomain) + '/' + currentInterval; } fetch(path + '/reports.json') From 97ad5bc998fd5977c833f06fb543e00588590eea Mon Sep 17 00:00:00 2001 From: Jordan Wages Date: Sat, 19 Jul 2025 01:50:27 -0500 Subject: [PATCH 02/39] Add nginx config parsing utilities --- scripts/nginx_config.py | 88 ++++++++++++++++++++++++++++++++++++++ tests/test_nginx_config.py | 70 ++++++++++++++++++++++++++++++ 2 files changed, 158 insertions(+) create mode 100644 scripts/nginx_config.py create mode 100644 tests/test_nginx_config.py diff --git a/scripts/nginx_config.py b/scripts/nginx_config.py new file mode 100644 index 0000000..e2dc447 --- /dev/null +++ b/scripts/nginx_config.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 +"""Utilities for discovering and parsing Nginx configuration files. + +This module provides helper functions to locate Nginx configuration files and +extract key details from ``server`` blocks. Typical usage:: + + from scripts.nginx_config import discover_configs, parse_servers + + files = discover_configs() + servers = parse_servers(files) + for s in servers: + print(s.get("server_name"), s.get("listen")) + +The functions intentionally tolerate missing or unreadable files and will simply +skip over them. +""" + +import os +import re +from pathlib import Path +from typing import Dict, List, Set + +DEFAULT_PATHS = [ + "/etc/nginx/nginx.conf", + "/usr/local/etc/nginx/nginx.conf", +] + +INCLUDE_RE = re.compile(r"^\s*include\s+(.*?);", re.MULTILINE) +SERVER_RE = re.compile(r"server\s*{(.*?)}", re.DOTALL) +DIRECTIVE_RE = re.compile(r"^\s*(\S+)\s+(.*?);", re.MULTILINE) + + +def discover_configs() -> Set[Path]: + """Return a set of all config files reachable from :data:`DEFAULT_PATHS`.""" + + found: Set[Path] = set() + queue = [Path(p) for p in DEFAULT_PATHS] + + while queue: + path = queue.pop() + if path in found: + continue + if not path.exists(): + continue + try: + text = path.read_text() + except OSError: + continue + found.add(path) + for pattern in INCLUDE_RE.findall(text): + pattern = os.path.expanduser(pattern.strip()) + for included in path.parent.glob(pattern): + if included.is_file() and included not in found: + queue.append(included) + return found + + +def parse_servers(paths: Set[Path]) -> List[Dict[str, str]]: + """Parse ``server`` blocks from the given files. + + Parameters + ---------- + paths: + Iterable of configuration file paths. 
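+
+    Returns
+    -------
+    list of dict
+        One mapping per ``server`` block, with any of the keys
+        ``server_name``, ``listen``, ``proxy_cache`` and ``root``.
+
+    Example (the path is illustrative only)::
+
+        servers = parse_servers({Path("/etc/nginx/nginx.conf")})
+        for s in servers:
+            print(s.get("server_name"), s.get("proxy_cache"))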
+ """ + + servers: List[Dict[str, str]] = [] + for p in paths: + try: + text = Path(p).read_text() + except OSError: + continue + for block in SERVER_RE.findall(text): + directives: Dict[str, List[str]] = {} + for name, value in DIRECTIVE_RE.findall(block): + directives.setdefault(name, []).append(value.strip()) + entry: Dict[str, str] = {} + if "server_name" in directives: + entry["server_name"] = " ".join(directives["server_name"]) + if "listen" in directives: + entry["listen"] = " ".join(directives["listen"]) + if "proxy_cache" in directives: + entry["proxy_cache"] = " ".join(directives["proxy_cache"]) + if "root" in directives: + entry["root"] = " ".join(directives["root"]) + servers.append(entry) + return servers + diff --git a/tests/test_nginx_config.py b/tests/test_nginx_config.py new file mode 100644 index 0000000..782c635 --- /dev/null +++ b/tests/test_nginx_config.py @@ -0,0 +1,70 @@ +import sys +from pathlib import Path +import pytest + +REPO_ROOT = Path(__file__).resolve().parents[1] +sys.path.append(str(REPO_ROOT)) +from scripts import nginx_config as nc + + +def test_discover_configs(tmp_path, monkeypatch): + root = tmp_path / "nginx" + root.mkdir() + conf_d = root / "conf.d" + conf_d.mkdir() + subdir = root / "sub" + subdir.mkdir() + + main = root / "nginx.conf" + site = conf_d / "site.conf" + extra = root / "extra.conf" + nested = subdir / "foo.conf" + + main.write_text("include conf.d/*.conf;\ninclude extra.conf;\n") + site.write_text("# site config\n") + extra.write_text("include sub/foo.conf;\n") + nested.write_text("# nested config\n") + + monkeypatch.setattr(nc, "DEFAULT_PATHS", [str(main)]) + found = nc.discover_configs() + + assert found == {main, site, extra, nested} + + +def test_parse_servers(tmp_path): + conf1 = tmp_path / "site.conf" + conf2 = tmp_path / "other.conf" + + conf1.write_text( + """ +server { + listen 80; + server_name example.com; + root /srv/example; + proxy_cache cache1; +} +""" + ) + + conf2.write_text( + """ +server { + listen 443 ssl; + server_name example.org; +} +""" + ) + + servers = nc.parse_servers({conf1, conf2, tmp_path / "missing.conf"}) + servers = sorted(servers, key=lambda s: s.get("server_name")) + + assert len(servers) == 2 + assert servers[0]["server_name"] == "example.com" + assert servers[0]["listen"] == "80" + assert servers[0]["root"] == "/srv/example" + assert servers[0]["proxy_cache"] == "cache1" + + assert servers[1]["server_name"] == "example.org" + assert servers[1]["listen"] == "443 ssl" + assert "proxy_cache" not in servers[1] + From 37fc8985564bd50170d96deb41a7f0ff15cf42d5 Mon Sep 17 00:00:00 2001 From: Jordan Wages Date: Sat, 19 Jul 2025 01:53:36 -0500 Subject: [PATCH 03/39] Add analyze utility for DB queries --- scripts/analyze.py | 114 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 scripts/analyze.py diff --git a/scripts/analyze.py b/scripts/analyze.py new file mode 100644 index 0000000..528af37 --- /dev/null +++ b/scripts/analyze.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +"""Utility helpers for ad-hoc log analysis. + +This module exposes small helper functions to inspect the ``ngxstat`` SQLite +database. The intent is to allow quick queries from the command line or other +scripts without rewriting SQL each time. + +Examples +-------- +To list all domains present in the database:: + + python scripts/analyze.py domains + +The CLI is powered by :mod:`typer` and currently only offers a couple of +commands. More analysis routines can be added over time. 
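+
+For example, the cache hit ratio for a single site (``example.com`` is a
+placeholder) can be checked with::
+
+    python scripts/analyze.py cache-ratio --domain example.com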
+""" +from __future__ import annotations + +import sqlite3 +from pathlib import Path +from typing import Dict, List, Optional + +import typer + +from scripts import nginx_config # noqa: F401 # imported for side effects/usage + +DB_PATH = Path("database/ngxstat.db") + +app = typer.Typer(help="Ad-hoc statistics queries") + + +def _connect() -> sqlite3.Connection: + """Return a new SQLite connection to :data:`DB_PATH`.""" + return sqlite3.connect(DB_PATH) + + +def load_domains_from_db() -> List[str]: + """Return a sorted list of distinct domains from the ``logs`` table.""" + conn = _connect() + cur = conn.cursor() + cur.execute("SELECT DISTINCT host FROM logs ORDER BY host") + domains = [row[0] for row in cur.fetchall()] + conn.close() + return domains + + +def get_hit_count(domain: Optional[str] = None) -> int: + """Return total request count. + + Parameters + ---------- + domain: + Optional domain to filter on. If ``None`` the count includes all logs. + """ + conn = _connect() + cur = conn.cursor() + if domain: + cur.execute("SELECT COUNT(*) FROM logs WHERE host = ?", (domain,)) + else: + cur.execute("SELECT COUNT(*) FROM logs") + count = cur.fetchone()[0] or 0 + conn.close() + return count + + +def get_cache_ratio(domain: Optional[str] = None) -> float: + """Return the percentage of requests served from cache.""" + conn = _connect() + cur = conn.cursor() + if domain: + cur.execute( + "SELECT SUM(CASE WHEN cache_status = 'HIT' THEN 1 ELSE 0 END) * 1.0 / " + "COUNT(*) FROM logs WHERE host = ?", + (domain,), + ) + else: + cur.execute( + "SELECT SUM(CASE WHEN cache_status = 'HIT' THEN 1 ELSE 0 END) * 1.0 / " + "COUNT(*) FROM logs" + ) + result = cur.fetchone()[0] + conn.close() + return float(result or 0.0) + + +@app.command() +def domains() -> None: + """Print the list of domains discovered in the database.""" + for d in load_domains_from_db(): + typer.echo(d) + + +@app.command() +def hits(domain: Optional[str] = typer.Option(None, help="Filter by domain")) -> None: + """Show request count.""" + count = get_hit_count(domain) + if domain: + typer.echo(f"{domain}: {count} hits") + else: + typer.echo(f"Total hits: {count}") + + +@app.command("cache-ratio") +def cache_ratio_cmd(domain: Optional[str] = typer.Option(None, help="Filter by domain")) -> None: + """Display cache hit ratio as a percentage.""" + ratio = get_cache_ratio(domain) * 100 + if domain: + typer.echo(f"{domain}: {ratio:.2f}% cached") + else: + typer.echo(f"Cache hit ratio: {ratio:.2f}%") + + +if __name__ == "__main__": + app() From 1a6e83663110c6e92129795716b04275f3f86089 Mon Sep 17 00:00:00 2001 From: Jordan Wages Date: Sat, 19 Jul 2025 02:00:37 -0500 Subject: [PATCH 04/39] Add check_missing_domains command and tests --- scripts/analyze.py | 35 ++++++++++++++++- tests/test_analyze.py | 90 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+), 1 deletion(-) create mode 100644 tests/test_analyze.py diff --git a/scripts/analyze.py b/scripts/analyze.py index 528af37..14634e0 100644 --- a/scripts/analyze.py +++ b/scripts/analyze.py @@ -18,7 +18,9 @@ from __future__ import annotations import sqlite3 from pathlib import Path -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Set + +import json import typer @@ -110,5 +112,36 @@ def cache_ratio_cmd(domain: Optional[str] = typer.Option(None, help="Filter by d typer.echo(f"Cache hit ratio: {ratio:.2f}%") +@app.command("check-missing-domains") +def check_missing_domains(json_output: bool = typer.Option(False, "--json", help="Output 
missing domains as JSON")) -> None: + """Show domains present in the database but absent from Nginx config.""" + try: + from scripts.generate_reports import _get_domains as _db_domains + except Exception: # pragma: no cover - fallback if import fails + _db_domains = load_domains_from_db + + if not isinstance(json_output, bool): + json_output = False + + db_domains = set(_db_domains()) + + paths = nginx_config.discover_configs() + servers = nginx_config.parse_servers(paths) + config_domains: Set[str] = set() + for server in servers: + names = server.get("server_name", "") + for name in names.split(): + if name: + config_domains.add(name) + + missing = sorted(db_domains - config_domains) + + if json_output: + typer.echo(json.dumps(missing)) + else: + for d in missing: + typer.echo(d) + + if __name__ == "__main__": app() diff --git a/tests/test_analyze.py b/tests/test_analyze.py new file mode 100644 index 0000000..5f2b4d9 --- /dev/null +++ b/tests/test_analyze.py @@ -0,0 +1,90 @@ +import sys +import json +import sqlite3 +from pathlib import Path + +import pytest + +REPO_ROOT = Path(__file__).resolve().parents[1] +sys.path.append(str(REPO_ROOT)) +from scripts import analyze +from scripts import generate_reports as gr + + +def setup_db(path: Path) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + conn = sqlite3.connect(path) + cur = conn.cursor() + cur.execute( + """ + CREATE TABLE logs ( + id INTEGER PRIMARY KEY, + ip TEXT, + host TEXT, + time TEXT, + request TEXT, + status INTEGER, + bytes_sent INTEGER, + referer TEXT, + user_agent TEXT, + cache_status TEXT + ) + """ + ) + cur.execute( + "INSERT INTO logs (ip, host, time, request, status, bytes_sent, referer, user_agent, cache_status) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", + ( + "127.0.0.1", + "example.com", + "2024-01-01 10:00:00", + "GET / HTTP/1.1", + 200, + 100, + "-", + "curl", + "MISS", + ), + ) + cur.execute( + "INSERT INTO logs (ip, host, time, request, status, bytes_sent, referer, user_agent, cache_status) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", + ( + "127.0.0.1", + "missing.com", + "2024-01-01 11:00:00", + "GET / HTTP/1.1", + 200, + 100, + "-", + "curl", + "MISS", + ), + ) + conn.commit() + conn.close() + + +def test_check_missing_domains(tmp_path, monkeypatch, capsys): + db_path = tmp_path / "database" / "ngxstat.db" + setup_db(db_path) + + conf = tmp_path / "nginx.conf" + conf.write_text( + """ +server { + listen 80; + server_name example.com; +} +""" + ) + + monkeypatch.setattr(analyze, "DB_PATH", db_path) + monkeypatch.setattr(gr, "DB_PATH", db_path) + monkeypatch.setattr(analyze.nginx_config, "DEFAULT_PATHS", [str(conf)]) + + analyze.check_missing_domains(json_output=False) + out = capsys.readouterr().out.strip().splitlines() + assert out == ["missing.com"] + + analyze.check_missing_domains(json_output=True) + out_json = json.loads(capsys.readouterr().out.strip()) + assert out_json == ["missing.com"] From 7f996fe123f50a71b975d244bad0f0e50fcbcc22 Mon Sep 17 00:00:00 2001 From: Jordan Wages Date: Sat, 19 Jul 2025 02:07:23 -0500 Subject: [PATCH 05/39] Add suggest_cache command and tests --- scripts/analyze.py | 54 ++++++++++++++++++++ tests/test_analyze.py | 115 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 169 insertions(+) diff --git a/scripts/analyze.py b/scripts/analyze.py index 14634e0..ded224d 100644 --- a/scripts/analyze.py +++ b/scripts/analyze.py @@ -143,5 +143,59 @@ def check_missing_domains(json_output: bool = typer.Option(False, "--json", help typer.echo(d) +@app.command("suggest-cache") +def 
suggest_cache( + threshold: int = typer.Option( + 10, help="Minimum number of MISS entries to report" + ), + json_output: bool = typer.Option(False, "--json", help="Output results as JSON"), +) -> None: + """Suggest domain/path pairs that could benefit from caching. + + Paths with at least ``threshold`` ``MISS`` entries are shown for domains + whose server blocks lack a ``proxy_cache`` directive. + """ + + # Discover domains without explicit proxy_cache + paths = nginx_config.discover_configs() + servers = nginx_config.parse_servers(paths) + no_cache: Set[str] = set() + for server in servers: + if "proxy_cache" in server: + continue + for name in server.get("server_name", "").split(): + if name: + no_cache.add(name) + + conn = _connect() + cur = conn.cursor() + cur.execute( + """ + SELECT host, + substr(request, instr(request, ' ')+1, + instr(request, ' HTTP') - instr(request, ' ') - 1) AS path, + COUNT(*) AS miss_count + FROM logs + WHERE cache_status = 'MISS' + GROUP BY host, path + HAVING miss_count >= ? + ORDER BY miss_count DESC + """, + (threshold,), + ) + + rows = [r for r in cur.fetchall() if r[0] in no_cache] + conn.close() + + if json_output: + result = [ + {"host": host, "path": path, "misses": count} for host, path, count in rows + ] + typer.echo(json.dumps(result)) + else: + for host, path, count in rows: + typer.echo(f"{host} {path} {count}") + + if __name__ == "__main__": app() diff --git a/tests/test_analyze.py b/tests/test_analyze.py index 5f2b4d9..40bdc40 100644 --- a/tests/test_analyze.py +++ b/tests/test_analyze.py @@ -88,3 +88,118 @@ server { analyze.check_missing_domains(json_output=True) out_json = json.loads(capsys.readouterr().out.strip()) assert out_json == ["missing.com"] + + +def test_suggest_cache(tmp_path, monkeypatch, capsys): + db_path = tmp_path / "database" / "ngxstat.db" + db_path.parent.mkdir(parents=True, exist_ok=True) + conn = sqlite3.connect(db_path) + cur = conn.cursor() + cur.execute( + """ + CREATE TABLE logs ( + id INTEGER PRIMARY KEY, + ip TEXT, + host TEXT, + time TEXT, + request TEXT, + status INTEGER, + bytes_sent INTEGER, + referer TEXT, + user_agent TEXT, + cache_status TEXT + ) + """ + ) + entries = [ + ( + "127.0.0.1", + "example.com", + "2024-01-01 10:00:00", + "GET /foo HTTP/1.1", + 200, + 100, + "-", + "curl", + "MISS", + ), + ( + "127.0.0.1", + "example.com", + "2024-01-01 10:01:00", + "GET /foo HTTP/1.1", + 200, + 100, + "-", + "curl", + "MISS", + ), + ( + "127.0.0.1", + "example.com", + "2024-01-01 10:02:00", + "GET /foo HTTP/1.1", + 200, + 100, + "-", + "curl", + "MISS", + ), + ( + "127.0.0.1", + "cached.com", + "2024-01-01 10:00:00", + "GET /bar HTTP/1.1", + 200, + 100, + "-", + "curl", + "MISS", + ), + ( + "127.0.0.1", + "cached.com", + "2024-01-01 10:01:00", + "GET /bar HTTP/1.1", + 200, + 100, + "-", + "curl", + "MISS", + ), + ] + cur.executemany( + "INSERT INTO logs (ip, host, time, request, status, bytes_sent, referer, user_agent, cache_status)" + " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", + entries, + ) + conn.commit() + conn.close() + + conf = tmp_path / "nginx.conf" + conf.write_text( + """ +server { + listen 80; + server_name example.com; +} + +server { + listen 80; + server_name cached.com; + proxy_cache cache1; +} +""" + ) + + monkeypatch.setattr(analyze, "DB_PATH", db_path) + monkeypatch.setattr(gr, "DB_PATH", db_path) + monkeypatch.setattr(analyze.nginx_config, "DEFAULT_PATHS", [str(conf)]) + + analyze.suggest_cache(threshold=2, json_output=False) + out = capsys.readouterr().out.strip().splitlines() + assert out == 
["example.com /foo 3"] + + analyze.suggest_cache(threshold=2, json_output=True) + out_json = json.loads(capsys.readouterr().out.strip()) + assert out_json == [{"host": "example.com", "path": "/foo", "misses": 3}] From 350445b167d4ff3fb413eaf8b79d815750ae07fc Mon Sep 17 00:00:00 2001 From: Jordan Wages Date: Sat, 19 Jul 2025 02:12:24 -0500 Subject: [PATCH 06/39] Add threat detection analysis --- scripts/analyze.py | 134 ++++++++++++++++++++++++++++++++++++++++++ tests/test_analyze.py | 119 +++++++++++++++++++++++++++++++++++++ 2 files changed, 253 insertions(+) diff --git a/scripts/analyze.py b/scripts/analyze.py index ded224d..219ceeb 100644 --- a/scripts/analyze.py +++ b/scripts/analyze.py @@ -19,6 +19,7 @@ from __future__ import annotations import sqlite3 from pathlib import Path from typing import Dict, List, Optional, Set +from datetime import datetime, timedelta import json @@ -27,6 +28,7 @@ import typer from scripts import nginx_config # noqa: F401 # imported for side effects/usage DB_PATH = Path("database/ngxstat.db") +ANALYSIS_DIR = Path("output/analysis") app = typer.Typer(help="Ad-hoc statistics queries") @@ -197,5 +199,137 @@ def suggest_cache( typer.echo(f"{host} {path} {count}") +@app.command("detect-threats") +def detect_threats( + hours: int = typer.Option(1, help="Number of recent hours to analyze"), + ip_threshold: int = typer.Option( + 100, help="Requests from a single IP to flag" + ), +) -> None: + """Detect potential security threats from recent logs.""" + + conn = _connect() + cur = conn.cursor() + + cur.execute("SELECT MAX(time) FROM logs") + row = cur.fetchone() + if not row or not row[0]: + typer.echo("No logs found") + conn.close() + return + + max_dt = datetime.strptime(row[0], "%Y-%m-%d %H:%M:%S") + recent_end = max_dt + recent_start = recent_end - timedelta(hours=hours) + prev_start = recent_start - timedelta(hours=hours) + prev_end = recent_start + + fmt = "%Y-%m-%d %H:%M:%S" + recent_start_s = recent_start.strftime(fmt) + recent_end_s = recent_end.strftime(fmt) + prev_start_s = prev_start.strftime(fmt) + prev_end_s = prev_end.strftime(fmt) + + cur.execute( + """ + SELECT host, + SUM(CASE WHEN status >= 400 THEN 1 ELSE 0 END) AS errors, + COUNT(*) AS total + FROM logs + WHERE time >= ? AND time < ? + GROUP BY host + """, + (recent_start_s, recent_end_s), + ) + recent_rows = {r[0]: (r[1], r[2]) for r in cur.fetchall()} + + cur.execute( + """ + SELECT host, + SUM(CASE WHEN status >= 400 THEN 1 ELSE 0 END) AS errors, + COUNT(*) AS total + FROM logs + WHERE time >= ? AND time < ? + GROUP BY host + """, + (prev_start_s, prev_end_s), + ) + prev_rows = {r[0]: (r[1], r[2]) for r in cur.fetchall()} + + error_spikes = [] + for host in set(recent_rows) | set(prev_rows): + r_err, r_total = recent_rows.get(host, (0, 0)) + p_err, p_total = prev_rows.get(host, (0, 0)) + r_rate = r_err * 100.0 / r_total if r_total else 0.0 + p_rate = p_err * 100.0 / p_total if p_total else 0.0 + if r_rate >= 10 and r_rate >= p_rate * 2: + error_spikes.append( + { + "host": host, + "recent_error_rate": round(r_rate, 2), + "previous_error_rate": round(p_rate, 2), + } + ) + + cur.execute( + """ + SELECT DISTINCT user_agent FROM logs + WHERE time >= ? AND time < ? + """, + (prev_start_s, prev_end_s), + ) + prev_agents = {r[0] for r in cur.fetchall()} + + cur.execute( + """ + SELECT user_agent, COUNT(*) AS c + FROM logs + WHERE time >= ? AND time < ? 
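+        -- counts one row per user agent in the recent window; the Python
+        -- code below flags agents that are absent from prev_agents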
+ GROUP BY user_agent + HAVING c >= 10 + """, + (recent_start_s, recent_end_s), + ) + suspicious_agents = [ + {"user_agent": ua, "requests": cnt} + for ua, cnt in cur.fetchall() + if ua not in prev_agents + ] + + cur.execute( + """ + SELECT ip, COUNT(*) AS c + FROM logs + WHERE time >= ? AND time < ? + GROUP BY ip + HAVING c >= ? + ORDER BY c DESC + """, + (recent_start_s, recent_end_s, ip_threshold), + ) + high_ip_requests = [ + {"ip": ip, "requests": cnt} for ip, cnt in cur.fetchall() + ] + + conn.close() + + report = { + "time_range": { + "recent_start": recent_start_s, + "recent_end": recent_end_s, + "previous_start": prev_start_s, + "previous_end": prev_end_s, + }, + "error_spikes": error_spikes, + "suspicious_agents": suspicious_agents, + "high_ip_requests": high_ip_requests, + } + + ANALYSIS_DIR.mkdir(parents=True, exist_ok=True) + out_path = ANALYSIS_DIR / "threat_report.json" + out_path.write_text(json.dumps(report, indent=2)) + typer.echo(json.dumps(report)) + + if __name__ == "__main__": app() diff --git a/tests/test_analyze.py b/tests/test_analyze.py index 40bdc40..a4358d7 100644 --- a/tests/test_analyze.py +++ b/tests/test_analyze.py @@ -203,3 +203,122 @@ server { analyze.suggest_cache(threshold=2, json_output=True) out_json = json.loads(capsys.readouterr().out.strip()) assert out_json == [{"host": "example.com", "path": "/foo", "misses": 3}] + + +def setup_threat_db(path: Path) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + conn = sqlite3.connect(path) + cur = conn.cursor() + cur.execute( + """ + CREATE TABLE logs ( + id INTEGER PRIMARY KEY, + ip TEXT, + host TEXT, + time TEXT, + request TEXT, + status INTEGER, + bytes_sent INTEGER, + referer TEXT, + user_agent TEXT, + cache_status TEXT + ) + """ + ) + + # Previous hour traffic with no errors + for i in range(10): + cur.execute( + "INSERT INTO logs (ip, host, time, request, status, bytes_sent, referer, user_agent, cache_status)" + " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", + ( + "2.2.2.2", + "example.com", + f"2024-01-01 11:{i:02d}:00", + "GET /ok HTTP/1.1", + 200, + 100, + "-", + "curl", + "MISS", + ), + ) + + # Recent hour with errors + for i in range(10): + cur.execute( + "INSERT INTO logs (ip, host, time, request, status, bytes_sent, referer, user_agent, cache_status)" + " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", + ( + "3.3.3.3", + "example.com", + f"2024-01-01 12:{i:02d}:00", + "GET /fail HTTP/1.1", + 500, + 100, + "-", + "curl", + "MISS", + ), + ) + + # High traffic from single IP + for i in range(101): + cur.execute( + "INSERT INTO logs (ip, host, time, request, status, bytes_sent, referer, user_agent, cache_status)" + " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", + ( + "1.1.1.1", + "example.net", + f"2024-01-01 12:{i % 10:02d}:30", + "GET /spam HTTP/1.1", + 200, + 100, + "-", + "curl", + "MISS", + ), + ) + + # New suspicious user agent + for i in range(15): + cur.execute( + "INSERT INTO logs (ip, host, time, request, status, bytes_sent, referer, user_agent, cache_status)" + " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", + ( + "4.4.4.4", + "example.org", + f"2024-01-01 12:{30 + i:02d}:45", + "GET /bot HTTP/1.1", + 200, + 100, + "-", + "newbot", + "MISS", + ), + ) + + conn.commit() + conn.close() + + +def test_detect_threats(tmp_path, monkeypatch): + db_path = tmp_path / "database" / "ngxstat.db" + setup_threat_db(db_path) + + out_dir = tmp_path / "analysis" + monkeypatch.setattr(analyze, "DB_PATH", db_path) + monkeypatch.setattr(analyze, "ANALYSIS_DIR", out_dir) + + analyze.detect_threats(hours=1, ip_threshold=100) + + report = 
json.loads((out_dir / "threat_report.json").read_text()) + + hosts = {e["host"] for e in report.get("error_spikes", [])} + assert "example.com" in hosts + + ips = {e["ip"] for e in report.get("high_ip_requests", [])} + assert "1.1.1.1" in ips + + agents = {e["user_agent"] for e in report.get("suspicious_agents", [])} + assert "newbot" in agents From 2e7e75e4ce8dfdb60506ea523d5c324a8d806069 Mon Sep 17 00:00:00 2001 From: Jordan Wages Date: Sat, 19 Jul 2025 02:19:08 -0500 Subject: [PATCH 07/39] Add run-analysis helper script and tests --- README.md | 11 +++++++++++ run-analysis.sh | 34 ++++++++++++++++++++++++++++++++++ tests/test_run_analysis.py | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+) create mode 100755 run-analysis.sh create mode 100644 tests/test_run_analysis.py diff --git a/README.md b/README.md index d206658..70f21d9 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,17 @@ Use the `run-reports.sh` script to run all report intervals in one step. The scr Running this script will create or update the hourly, daily, weekly and monthly reports under `output/`. It also detects all unique domains found in the database and writes per-domain reports to `output/domains//` alongside the aggregate data. After generation, open `output/index.html` in your browser to browse the reports. + +## Log Analysis + +The `run-analysis.sh` script runs helper routines that inspect the database. It +creates or reuses the virtual environment and then executes a set of analysis +commands to spot missing domains, suggest cache rules and detect potential +threats. + +```bash +./run-analysis.sh +``` ## Serving Reports with Nginx To expose the generated HTML dashboards and JSON files over HTTP you can use a diff --git a/run-analysis.sh b/run-analysis.sh new file mode 100755 index 0000000..1f5c213 --- /dev/null +++ b/run-analysis.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +set -e + +# Ensure virtual environment exists +if [ ! -d ".venv" ]; then + echo "[INFO] Creating virtual environment..." + python3 -m venv .venv + source .venv/bin/activate + echo "[INFO] Installing dependencies..." + pip install --upgrade pip + if [ -f requirements.txt ]; then + pip install -r requirements.txt + else + echo "[WARN] requirements.txt not found, skipping." + fi +else + echo "[INFO] Activating virtual environment..." + source .venv/bin/activate +fi + +# Run analysis helpers +echo "[INFO] Checking for missing domains..." +python scripts/analyze.py check-missing-domains + +echo "[INFO] Suggesting cache improvements..." +python scripts/analyze.py suggest-cache + +echo "[INFO] Detecting threats..." 
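+# detect-threats also writes output/analysis/threat_report.json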
+python scripts/analyze.py detect-threats + +# Deactivate to keep cron environment clean +if type deactivate >/dev/null 2>&1; then + deactivate +fi diff --git a/tests/test_run_analysis.py b/tests/test_run_analysis.py new file mode 100644 index 0000000..60d2810 --- /dev/null +++ b/tests/test_run_analysis.py @@ -0,0 +1,32 @@ +import os +import subprocess +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[1] +SCRIPT = REPO_ROOT / "run-analysis.sh" + + +def test_script_invokes_commands(tmp_path): + # create stub virtualenv so the script skips creation + venv = tmp_path / ".venv" / "bin" + venv.mkdir(parents=True) + (venv / "activate").write_text(":") + + calls = tmp_path / "calls.txt" + python_stub = tmp_path / "python" + python_stub.write_text(f"#!/usr/bin/env bash\necho \"$*\" >> \"{calls}\"\n") + python_stub.chmod(0o755) + (tmp_path / "python3").write_text(f"#!/usr/bin/env bash\nexit 0\n") + (tmp_path / "python3").chmod(0o755) + + env = os.environ.copy() + env["PATH"] = f"{tmp_path}:" + env.get("PATH", "") + + subprocess.run([str(SCRIPT)], cwd=tmp_path, env=env, check=True) + + lines = calls.read_text().strip().splitlines() + assert lines == [ + "scripts/analyze.py check-missing-domains", + "scripts/analyze.py suggest-cache", + "scripts/analyze.py detect-threats", + ] From 9cf27ecb2f917b5a2ac91ded9c8b50a548ef2ba3 Mon Sep 17 00:00:00 2001 From: Jordan Wages Date: Sat, 19 Jul 2025 02:30:15 -0500 Subject: [PATCH 08/39] Add analysis tab and JSON outputs --- README.md | 2 + scripts/analyze.py | 19 +++++-- templates/index.html | 131 +++++++++++++++++++++++++++++++++++++------ 3 files changed, 130 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 70f21d9..acb1055 100644 --- a/README.md +++ b/README.md @@ -98,6 +98,8 @@ threats. ```bash ./run-analysis.sh ``` +The JSON results are written under `output/analysis` and can be viewed from the +"Analysis" tab in the generated dashboard. ## Serving Reports with Nginx To expose the generated HTML dashboards and JSON files over HTTP you can use a diff --git a/scripts/analyze.py b/scripts/analyze.py index 219ceeb..8ac7c30 100644 --- a/scripts/analyze.py +++ b/scripts/analyze.py @@ -138,6 +138,10 @@ def check_missing_domains(json_output: bool = typer.Option(False, "--json", help missing = sorted(db_domains - config_domains) + ANALYSIS_DIR.mkdir(parents=True, exist_ok=True) + out_path = ANALYSIS_DIR / "missing_domains.json" + out_path.write_text(json.dumps(missing, indent=2)) + if json_output: typer.echo(json.dumps(missing)) else: @@ -189,14 +193,19 @@ def suggest_cache( rows = [r for r in cur.fetchall() if r[0] in no_cache] conn.close() + result = [ + {"host": host, "path": path, "misses": count} for host, path, count in rows + ] + + ANALYSIS_DIR.mkdir(parents=True, exist_ok=True) + out_path = ANALYSIS_DIR / "cache_suggestions.json" + out_path.write_text(json.dumps(result, indent=2)) + if json_output: - result = [ - {"host": host, "path": path, "misses": count} for host, path, count in rows - ] typer.echo(json.dumps(result)) else: - for host, path, count in rows: - typer.echo(f"{host} {path} {count}") + for item in result: + typer.echo(f"{item['host']} {item['path']} {item['misses']}") @app.command("detect-threats") diff --git a/templates/index.html b/templates/index.html index 91482f7..7b0b98f 100644 --- a/templates/index.html +++ b/templates/index.html @@ -15,6 +15,7 @@
             <ul>
                 <li data-tab="overview"><a>Overview</a></li>
                 <li data-tab="all"><a>All Domains</a></li>
                 <li data-tab="domain"><a>Per Domain</a></li>
+                <li data-tab="analysis"><a>Analysis</a></li>
             </ul>
         </div>
@@ -56,9 +57,15 @@
    - + + + @@ -73,13 +80,19 @@ const sections = { overview: document.getElementById('overview-section'), all: document.getElementById('all-section'), - domain: document.getElementById('domain-section') + domain: document.getElementById('domain-section'), + analysis: document.getElementById('analysis-section') }; const containers = { overview: document.getElementById('overview-reports'), all: document.getElementById('reports-all'), domain: document.getElementById('reports-domain') }; + const analysisElems = { + missing: document.getElementById('analysis-missing'), + cache: document.getElementById('analysis-cache'), + threats: document.getElementById('analysis-threats') + }; const totalElem = document.getElementById('stat-total'); const startElem = document.getElementById('stat-start'); const endElem = document.getElementById('stat-end'); @@ -169,19 +182,99 @@ path = 'domains/' + encodeURIComponent(currentDomain) + '/' + currentInterval; } - fetch(path + '/reports.json') - .then(r => r.json()) - .then(reports => { - container.innerHTML = ''; - reports.forEach(rep => { - fetch(path + '/' + rep.html) - .then(r => r.text()) - .then(html => { - container.insertAdjacentHTML('beforeend', html); - initReport(rep, path); - }); + fetch(path + '/reports.json') + .then(r => r.json()) + .then(reports => { + container.innerHTML = ''; + reports.forEach(rep => { + fetch(path + '/' + rep.html) + .then(r => r.text()) + .then(html => { + container.insertAdjacentHTML('beforeend', html); + initReport(rep, path); + }); + }); + feather.replace(); }); - feather.replace(); + } + + function loadAnalysis() { + analysisElems.missing.innerHTML = '
<h3>Missing Domains</h3>';
+        analysisElems.cache.innerHTML = '<h3>Cache Suggestions</h3>';
+        analysisElems.threats.innerHTML = '<h3>Threat Report</h3>';
+
+        fetch('analysis/missing_domains.json')
+            .then(r => r.json())
+            .then(list => {
+                if (list.length === 0) {
+                    analysisElems.missing.insertAdjacentHTML('beforeend', '<p>None</p>');
+                    return;
+                }
+                const items = list.map(d => `<li>${d}</li>`).join('');
+                analysisElems.missing.insertAdjacentHTML('beforeend', `<ul>${items}</ul>`);
+            });
+
+        fetch('analysis/cache_suggestions.json')
+            .then(r => r.json())
+            .then(data => {
+                if (data.length === 0) {
+                    analysisElems.cache.insertAdjacentHTML('beforeend', '<p>No suggestions</p>');
+                    return;
+                }
+                analysisElems.cache.insertAdjacentHTML('beforeend', '<table id="table-cache"></table>');
+                const rows = data.map(x => [x.host, x.path, x.misses]);
+                new DataTable('#table-cache', {
+                    data: rows,
+                    columns: [
+                        { title: 'Domain' },
+                        { title: 'Path' },
+                        { title: 'Misses' }
+                    ]
+                });
+            });
+
+        fetch('analysis/threat_report.json')
+            .then(r => r.json())
+            .then(rep => {
+                const hasData = rep.error_spikes?.length || rep.suspicious_agents?.length || rep.high_ip_requests?.length;
+                if (!hasData) {
+                    analysisElems.threats.insertAdjacentHTML('beforeend', '<p>No threats detected</p>');
+                    return;
+                }
+                if (rep.error_spikes && rep.error_spikes.length) {
+                    analysisElems.threats.insertAdjacentHTML('beforeend', '<h4>Error Spikes</h4><table id="table-errors"></table>');
+                    const rows = rep.error_spikes.map(x => [x.host, x.recent_error_rate, x.previous_error_rate]);
+                    new DataTable('#table-errors', {
+                        data: rows,
+                        columns: [
+                            { title: 'Domain' },
+                            { title: 'Recent %' },
+                            { title: 'Previous %' }
+                        ]
+                    });
+                }
+                if (rep.suspicious_agents && rep.suspicious_agents.length) {
+                    analysisElems.threats.insertAdjacentHTML('beforeend', '<h4>Suspicious User Agents</h4><table id="table-agents"></table>');
+                    const rows = rep.suspicious_agents.map(x => [x.user_agent, x.requests]);
+                    new DataTable('#table-agents', {
+                        data: rows,
+                        columns: [
+                            { title: 'User Agent' },
+                            { title: 'Requests' }
+                        ]
+                    });
+                }
+                if (rep.high_ip_requests && rep.high_ip_requests.length) {
+                    analysisElems.threats.insertAdjacentHTML('beforeend', '<h4>High IP Requests</h4><table id="table-ips"></table>
    '); + const rows = rep.high_ip_requests.map(x => [x.ip, x.requests]); + new DataTable('#table-ips', { + data: rows, + columns: [ + { title: 'IP' }, + { title: 'Requests' } + ] + }); + } }); } @@ -198,7 +291,11 @@ if (name === 'overview') { loadStats(); } - loadReports(); + if (name === 'analysis') { + loadAnalysis(); + } else { + loadReports(); + } } intervalSelect.addEventListener('change', () => { From 0a05f0c010d791c8fcf9f33a9787f224bbac7ae3 Mon Sep 17 00:00:00 2001 From: Jordan Wages Date: Sat, 19 Jul 2025 02:55:15 -0500 Subject: [PATCH 09/39] Fix module imports in run-analysis --- run-analysis.sh | 6 +++--- scripts/__init__.py | 1 + tests/test_run_analysis.py | 6 +++--- 3 files changed, 7 insertions(+), 6 deletions(-) create mode 100644 scripts/__init__.py diff --git a/run-analysis.sh b/run-analysis.sh index 1f5c213..e487224 100755 --- a/run-analysis.sh +++ b/run-analysis.sh @@ -20,13 +20,13 @@ fi # Run analysis helpers echo "[INFO] Checking for missing domains..." -python scripts/analyze.py check-missing-domains +python -m scripts.analyze check-missing-domains echo "[INFO] Suggesting cache improvements..." -python scripts/analyze.py suggest-cache +python -m scripts.analyze suggest-cache echo "[INFO] Detecting threats..." -python scripts/analyze.py detect-threats +python -m scripts.analyze detect-threats # Deactivate to keep cron environment clean if type deactivate >/dev/null 2>&1; then diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..f4c57a1 --- /dev/null +++ b/scripts/__init__.py @@ -0,0 +1 @@ +"Utility package for ngxstat scripts" diff --git a/tests/test_run_analysis.py b/tests/test_run_analysis.py index 60d2810..66d00df 100644 --- a/tests/test_run_analysis.py +++ b/tests/test_run_analysis.py @@ -26,7 +26,7 @@ def test_script_invokes_commands(tmp_path): lines = calls.read_text().strip().splitlines() assert lines == [ - "scripts/analyze.py check-missing-domains", - "scripts/analyze.py suggest-cache", - "scripts/analyze.py detect-threats", + "-m scripts.analyze check-missing-domains", + "-m scripts.analyze suggest-cache", + "-m scripts.analyze detect-threats", ] From 2443aecaf65c4bbd8b66a4cf7ee34dc8ece69ab7 Mon Sep 17 00:00:00 2001 From: Jordan Wages Date: Sat, 19 Jul 2025 03:17:07 -0500 Subject: [PATCH 10/39] fix nginx config discovery for absolute include patterns --- scripts/nginx_config.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/scripts/nginx_config.py b/scripts/nginx_config.py index e2dc447..dbd635d 100644 --- a/scripts/nginx_config.py +++ b/scripts/nginx_config.py @@ -49,7 +49,15 @@ def discover_configs() -> Set[Path]: found.add(path) for pattern in INCLUDE_RE.findall(text): pattern = os.path.expanduser(pattern.strip()) - for included in path.parent.glob(pattern): + if os.path.isabs(pattern): + # ``Path.glob`` does not allow absolute patterns, so we + # anchor at the filesystem root and remove the leading + # separator. 
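+                # e.g. "include /etc/nginx/conf.d/*.conf;" is globbed as
+                # Path("/").glob("etc/nginx/conf.d/*.conf"); the pattern
+                # shown is illustrative only.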
+ base = Path(os.sep) + glob_iter = base.glob(pattern.lstrip(os.sep)) + else: + glob_iter = path.parent.glob(pattern) + for included in glob_iter: if included.is_file() and included not in found: queue.append(included) return found From d1f3c5a9ae0fa66e56828743c7103eb2f967753d Mon Sep 17 00:00:00 2001 From: Jordan Wages Date: Sat, 19 Jul 2025 03:30:08 -0500 Subject: [PATCH 11/39] Hide analysis duration --- scripts/generate_reports.py | 2 +- templates/index.html | 2 +- tests/test_reports.py | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/generate_reports.py b/scripts/generate_reports.py index ae7a3a7..4b5692a 100644 --- a/scripts/generate_reports.py +++ b/scripts/generate_reports.py @@ -175,7 +175,7 @@ def _generate_root_index() -> None: intervals = [ p.name for p in OUTPUT_DIR.iterdir() - if p.is_dir() and p.name.lower() not in {"domains", "global"} + if p.is_dir() and p.name.lower() not in {"domains", "global", "analysis"} ] intervals.sort() diff --git a/templates/index.html b/templates/index.html index 7b0b98f..0a8abed 100644 --- a/templates/index.html +++ b/templates/index.html @@ -286,7 +286,7 @@ Object.entries(sections).forEach(([key, section]) => { section.classList.toggle('is-hidden', key !== name); }); - intervalControl.classList.toggle('is-hidden', name === 'overview'); + intervalControl.classList.toggle('is-hidden', name === 'overview' || name === 'analysis'); domainControl.classList.toggle('is-hidden', name !== 'domain'); if (name === 'overview') { loadStats(); diff --git a/tests/test_reports.py b/tests/test_reports.py index fec898c..dbe71c2 100644 --- a/tests/test_reports.py +++ b/tests/test_reports.py @@ -184,6 +184,8 @@ def test_generate_root_index(tmp_path, sample_reports, monkeypatch): (tmp_path / "output" / "domains" / "bar.com").mkdir(parents=True) # add an extra directory with capitalized name to ensure it's ignored (tmp_path / "output" / "Global").mkdir(parents=True) + # add an analysis directory to ensure it's excluded + (tmp_path / "output" / "analysis").mkdir(parents=True) gr._generate_root_index() @@ -196,6 +198,7 @@ def test_generate_root_index(tmp_path, sample_reports, monkeypatch): assert '
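
Taken together, the series leaves `output/` holding the interval reports plus the reserved `domains/`, `global/` and `analysis/` directories. A minimal sketch of how an external script might consume the analysis artifacts; the file names are taken from the patches above, but the helper itself is hypothetical and not part of the project:

```python
import json
from pathlib import Path

# Written by the commands that run-analysis.sh invokes
ANALYSIS_DIR = Path("output/analysis")


def summarize() -> None:
    """Print a one-line summary for each analysis artifact that exists."""
    missing = ANALYSIS_DIR / "missing_domains.json"
    cache = ANALYSIS_DIR / "cache_suggestions.json"
    threats = ANALYSIS_DIR / "threat_report.json"

    if missing.exists():
        domains = json.loads(missing.read_text())
        print(f"{len(domains)} domain(s) in the database but not in nginx config")
    if cache.exists():
        suggestions = json.loads(cache.read_text())
        print(f"{len(suggestions)} path(s) suggested for caching")
    if threats.exists():
        report = json.loads(threats.read_text())
        print(f"{len(report.get('high_ip_requests', []))} high-volume IP(s) flagged")


if __name__ == "__main__":
    summarize()
```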