From 350445b167d4ff3fb413eaf8b79d815750ae07fc Mon Sep 17 00:00:00 2001 From: Jordan Wages Date: Sat, 19 Jul 2025 02:12:24 -0500 Subject: [PATCH 01/34] Add threat detection analysis --- scripts/analyze.py | 134 ++++++++++++++++++++++++++++++++++++++++++ tests/test_analyze.py | 119 +++++++++++++++++++++++++++++++++++++ 2 files changed, 253 insertions(+) diff --git a/scripts/analyze.py b/scripts/analyze.py index ded224d..219ceeb 100644 --- a/scripts/analyze.py +++ b/scripts/analyze.py @@ -19,6 +19,7 @@ from __future__ import annotations import sqlite3 from pathlib import Path from typing import Dict, List, Optional, Set +from datetime import datetime, timedelta import json @@ -27,6 +28,7 @@ import typer from scripts import nginx_config # noqa: F401 # imported for side effects/usage DB_PATH = Path("database/ngxstat.db") +ANALYSIS_DIR = Path("output/analysis") app = typer.Typer(help="Ad-hoc statistics queries") @@ -197,5 +199,137 @@ def suggest_cache( typer.echo(f"{host} {path} {count}") +@app.command("detect-threats") +def detect_threats( + hours: int = typer.Option(1, help="Number of recent hours to analyze"), + ip_threshold: int = typer.Option( + 100, help="Requests from a single IP to flag" + ), +) -> None: + """Detect potential security threats from recent logs.""" + + conn = _connect() + cur = conn.cursor() + + cur.execute("SELECT MAX(time) FROM logs") + row = cur.fetchone() + if not row or not row[0]: + typer.echo("No logs found") + conn.close() + return + + max_dt = datetime.strptime(row[0], "%Y-%m-%d %H:%M:%S") + recent_end = max_dt + recent_start = recent_end - timedelta(hours=hours) + prev_start = recent_start - timedelta(hours=hours) + prev_end = recent_start + + fmt = "%Y-%m-%d %H:%M:%S" + recent_start_s = recent_start.strftime(fmt) + recent_end_s = recent_end.strftime(fmt) + prev_start_s = prev_start.strftime(fmt) + prev_end_s = prev_end.strftime(fmt) + + cur.execute( + """ + SELECT host, + SUM(CASE WHEN status >= 400 THEN 1 ELSE 0 END) AS errors, + COUNT(*) AS total + FROM logs + WHERE time >= ? AND time < ? + GROUP BY host + """, + (recent_start_s, recent_end_s), + ) + recent_rows = {r[0]: (r[1], r[2]) for r in cur.fetchall()} + + cur.execute( + """ + SELECT host, + SUM(CASE WHEN status >= 400 THEN 1 ELSE 0 END) AS errors, + COUNT(*) AS total + FROM logs + WHERE time >= ? AND time < ? + GROUP BY host + """, + (prev_start_s, prev_end_s), + ) + prev_rows = {r[0]: (r[1], r[2]) for r in cur.fetchall()} + + error_spikes = [] + for host in set(recent_rows) | set(prev_rows): + r_err, r_total = recent_rows.get(host, (0, 0)) + p_err, p_total = prev_rows.get(host, (0, 0)) + r_rate = r_err * 100.0 / r_total if r_total else 0.0 + p_rate = p_err * 100.0 / p_total if p_total else 0.0 + if r_rate >= 10 and r_rate >= p_rate * 2: + error_spikes.append( + { + "host": host, + "recent_error_rate": round(r_rate, 2), + "previous_error_rate": round(p_rate, 2), + } + ) + + cur.execute( + """ + SELECT DISTINCT user_agent FROM logs + WHERE time >= ? AND time < ? + """, + (prev_start_s, prev_end_s), + ) + prev_agents = {r[0] for r in cur.fetchall()} + + cur.execute( + """ + SELECT user_agent, COUNT(*) AS c + FROM logs + WHERE time >= ? AND time < ? + GROUP BY user_agent + HAVING c >= 10 + """, + (recent_start_s, recent_end_s), + ) + suspicious_agents = [ + {"user_agent": ua, "requests": cnt} + for ua, cnt in cur.fetchall() + if ua not in prev_agents + ] + + cur.execute( + """ + SELECT ip, COUNT(*) AS c + FROM logs + WHERE time >= ? AND time < ? + GROUP BY ip + HAVING c >= ? 
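+            -- busiest sources first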
+ ORDER BY c DESC + """, + (recent_start_s, recent_end_s, ip_threshold), + ) + high_ip_requests = [ + {"ip": ip, "requests": cnt} for ip, cnt in cur.fetchall() + ] + + conn.close() + + report = { + "time_range": { + "recent_start": recent_start_s, + "recent_end": recent_end_s, + "previous_start": prev_start_s, + "previous_end": prev_end_s, + }, + "error_spikes": error_spikes, + "suspicious_agents": suspicious_agents, + "high_ip_requests": high_ip_requests, + } + + ANALYSIS_DIR.mkdir(parents=True, exist_ok=True) + out_path = ANALYSIS_DIR / "threat_report.json" + out_path.write_text(json.dumps(report, indent=2)) + typer.echo(json.dumps(report)) + + if __name__ == "__main__": app() diff --git a/tests/test_analyze.py b/tests/test_analyze.py index 40bdc40..a4358d7 100644 --- a/tests/test_analyze.py +++ b/tests/test_analyze.py @@ -203,3 +203,122 @@ server { analyze.suggest_cache(threshold=2, json_output=True) out_json = json.loads(capsys.readouterr().out.strip()) assert out_json == [{"host": "example.com", "path": "/foo", "misses": 3}] + + +def setup_threat_db(path: Path) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + conn = sqlite3.connect(path) + cur = conn.cursor() + cur.execute( + """ + CREATE TABLE logs ( + id INTEGER PRIMARY KEY, + ip TEXT, + host TEXT, + time TEXT, + request TEXT, + status INTEGER, + bytes_sent INTEGER, + referer TEXT, + user_agent TEXT, + cache_status TEXT + ) + """ + ) + + # Previous hour traffic with no errors + for i in range(10): + cur.execute( + "INSERT INTO logs (ip, host, time, request, status, bytes_sent, referer, user_agent, cache_status)" + " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", + ( + "2.2.2.2", + "example.com", + f"2024-01-01 11:{i:02d}:00", + "GET /ok HTTP/1.1", + 200, + 100, + "-", + "curl", + "MISS", + ), + ) + + # Recent hour with errors + for i in range(10): + cur.execute( + "INSERT INTO logs (ip, host, time, request, status, bytes_sent, referer, user_agent, cache_status)" + " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", + ( + "3.3.3.3", + "example.com", + f"2024-01-01 12:{i:02d}:00", + "GET /fail HTTP/1.1", + 500, + 100, + "-", + "curl", + "MISS", + ), + ) + + # High traffic from single IP + for i in range(101): + cur.execute( + "INSERT INTO logs (ip, host, time, request, status, bytes_sent, referer, user_agent, cache_status)" + " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", + ( + "1.1.1.1", + "example.net", + f"2024-01-01 12:{i % 10:02d}:30", + "GET /spam HTTP/1.1", + 200, + 100, + "-", + "curl", + "MISS", + ), + ) + + # New suspicious user agent + for i in range(15): + cur.execute( + "INSERT INTO logs (ip, host, time, request, status, bytes_sent, referer, user_agent, cache_status)" + " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", + ( + "4.4.4.4", + "example.org", + f"2024-01-01 12:{30 + i:02d}:45", + "GET /bot HTTP/1.1", + 200, + 100, + "-", + "newbot", + "MISS", + ), + ) + + conn.commit() + conn.close() + + +def test_detect_threats(tmp_path, monkeypatch): + db_path = tmp_path / "database" / "ngxstat.db" + setup_threat_db(db_path) + + out_dir = tmp_path / "analysis" + monkeypatch.setattr(analyze, "DB_PATH", db_path) + monkeypatch.setattr(analyze, "ANALYSIS_DIR", out_dir) + + analyze.detect_threats(hours=1, ip_threshold=100) + + report = json.loads((out_dir / "threat_report.json").read_text()) + + hosts = {e["host"] for e in report.get("error_spikes", [])} + assert "example.com" in hosts + + ips = {e["ip"] for e in report.get("high_ip_requests", [])} + assert "1.1.1.1" in ips + + agents = {e["user_agent"] for e in report.get("suspicious_agents", [])} + assert 
"newbot" in agents From 2e7e75e4ce8dfdb60506ea523d5c324a8d806069 Mon Sep 17 00:00:00 2001 From: Jordan Wages Date: Sat, 19 Jul 2025 02:19:08 -0500 Subject: [PATCH 02/34] Add run-analysis helper script and tests --- README.md | 11 +++++++++++ run-analysis.sh | 34 ++++++++++++++++++++++++++++++++++ tests/test_run_analysis.py | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+) create mode 100755 run-analysis.sh create mode 100644 tests/test_run_analysis.py diff --git a/README.md b/README.md index d206658..70f21d9 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,17 @@ Use the `run-reports.sh` script to run all report intervals in one step. The scr Running this script will create or update the hourly, daily, weekly and monthly reports under `output/`. It also detects all unique domains found in the database and writes per-domain reports to `output/domains//` alongside the aggregate data. After generation, open `output/index.html` in your browser to browse the reports. + +## Log Analysis + +The `run-analysis.sh` script runs helper routines that inspect the database. It +creates or reuses the virtual environment and then executes a set of analysis +commands to spot missing domains, suggest cache rules and detect potential +threats. + +```bash +./run-analysis.sh +``` ## Serving Reports with Nginx To expose the generated HTML dashboards and JSON files over HTTP you can use a diff --git a/run-analysis.sh b/run-analysis.sh new file mode 100755 index 0000000..1f5c213 --- /dev/null +++ b/run-analysis.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +set -e + +# Ensure virtual environment exists +if [ ! -d ".venv" ]; then + echo "[INFO] Creating virtual environment..." + python3 -m venv .venv + source .venv/bin/activate + echo "[INFO] Installing dependencies..." + pip install --upgrade pip + if [ -f requirements.txt ]; then + pip install -r requirements.txt + else + echo "[WARN] requirements.txt not found, skipping." + fi +else + echo "[INFO] Activating virtual environment..." + source .venv/bin/activate +fi + +# Run analysis helpers +echo "[INFO] Checking for missing domains..." +python scripts/analyze.py check-missing-domains + +echo "[INFO] Suggesting cache improvements..." +python scripts/analyze.py suggest-cache + +echo "[INFO] Detecting threats..." 
+python scripts/analyze.py detect-threats + +# Deactivate to keep cron environment clean +if type deactivate >/dev/null 2>&1; then + deactivate +fi diff --git a/tests/test_run_analysis.py b/tests/test_run_analysis.py new file mode 100644 index 0000000..60d2810 --- /dev/null +++ b/tests/test_run_analysis.py @@ -0,0 +1,32 @@ +import os +import subprocess +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[1] +SCRIPT = REPO_ROOT / "run-analysis.sh" + + +def test_script_invokes_commands(tmp_path): + # create stub virtualenv so the script skips creation + venv = tmp_path / ".venv" / "bin" + venv.mkdir(parents=True) + (venv / "activate").write_text(":") + + calls = tmp_path / "calls.txt" + python_stub = tmp_path / "python" + python_stub.write_text(f"#!/usr/bin/env bash\necho \"$*\" >> \"{calls}\"\n") + python_stub.chmod(0o755) + (tmp_path / "python3").write_text(f"#!/usr/bin/env bash\nexit 0\n") + (tmp_path / "python3").chmod(0o755) + + env = os.environ.copy() + env["PATH"] = f"{tmp_path}:" + env.get("PATH", "") + + subprocess.run([str(SCRIPT)], cwd=tmp_path, env=env, check=True) + + lines = calls.read_text().strip().splitlines() + assert lines == [ + "scripts/analyze.py check-missing-domains", + "scripts/analyze.py suggest-cache", + "scripts/analyze.py detect-threats", + ] From 9cf27ecb2f917b5a2ac91ded9c8b50a548ef2ba3 Mon Sep 17 00:00:00 2001 From: Jordan Wages Date: Sat, 19 Jul 2025 02:30:15 -0500 Subject: [PATCH 03/34] Add analysis tab and JSON outputs --- README.md | 2 + scripts/analyze.py | 19 +++++-- templates/index.html | 131 +++++++++++++++++++++++++++++++++++++------ 3 files changed, 130 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 70f21d9..acb1055 100644 --- a/README.md +++ b/README.md @@ -98,6 +98,8 @@ threats. ```bash ./run-analysis.sh ``` +The JSON results are written under `output/analysis` and can be viewed from the +"Analysis" tab in the generated dashboard. ## Serving Reports with Nginx To expose the generated HTML dashboards and JSON files over HTTP you can use a diff --git a/scripts/analyze.py b/scripts/analyze.py index 219ceeb..8ac7c30 100644 --- a/scripts/analyze.py +++ b/scripts/analyze.py @@ -138,6 +138,10 @@ def check_missing_domains(json_output: bool = typer.Option(False, "--json", help missing = sorted(db_domains - config_domains) + ANALYSIS_DIR.mkdir(parents=True, exist_ok=True) + out_path = ANALYSIS_DIR / "missing_domains.json" + out_path.write_text(json.dumps(missing, indent=2)) + if json_output: typer.echo(json.dumps(missing)) else: @@ -189,14 +193,19 @@ def suggest_cache( rows = [r for r in cur.fetchall() if r[0] in no_cache] conn.close() + result = [ + {"host": host, "path": path, "misses": count} for host, path, count in rows + ] + + ANALYSIS_DIR.mkdir(parents=True, exist_ok=True) + out_path = ANALYSIS_DIR / "cache_suggestions.json" + out_path.write_text(json.dumps(result, indent=2)) + if json_output: - result = [ - {"host": host, "path": path, "misses": count} for host, path, count in rows - ] typer.echo(json.dumps(result)) else: - for host, path, count in rows: - typer.echo(f"{host} {path} {count}") + for item in result: + typer.echo(f"{item['host']} {item['path']} {item['misses']}") @app.command("detect-threats") diff --git a/templates/index.html b/templates/index.html index 91482f7..7b0b98f 100644 --- a/templates/index.html +++ b/templates/index.html @@ -15,6 +15,7 @@
         <li data-tab="overview" class="is-active"><a>Overview</a></li>
         <li data-tab="all"><a>All Domains</a></li>
         <li data-tab="domain"><a>Per Domain</a></li>
+        <li data-tab="analysis"><a>Analysis</a></li>
       </ul>
@@ -56,9 +57,15 @@
       <div id="domain-section" class="is-hidden">
         <div id="reports-domain"></div>
-      </div>
+      </div>
+      <div id="analysis-section" class="is-hidden">
+        <div id="analysis-missing"></div>
+        <div id="analysis-cache"></div>
+        <div id="analysis-threats"></div>
+      </div>
+
@@ -73,13 +80,19 @@
     const sections = {
       overview: document.getElementById('overview-section'),
       all: document.getElementById('all-section'),
-      domain: document.getElementById('domain-section')
+      domain: document.getElementById('domain-section'),
+      analysis: document.getElementById('analysis-section')
     };
     const containers = {
       overview: document.getElementById('overview-reports'),
       all: document.getElementById('reports-all'),
       domain: document.getElementById('reports-domain')
     };
+    const analysisElems = {
+      missing: document.getElementById('analysis-missing'),
+      cache: document.getElementById('analysis-cache'),
+      threats: document.getElementById('analysis-threats')
+    };
     const totalElem = document.getElementById('stat-total');
     const startElem = document.getElementById('stat-start');
     const endElem = document.getElementById('stat-end');
@@ -169,19 +182,99 @@
       path = 'domains/' + encodeURIComponent(currentDomain) + '/' + currentInterval;
     }
-    fetch(path + '/reports.json')
-      .then(r => r.json())
-      .then(reports => {
-        container.innerHTML = '';
-        reports.forEach(rep => {
-          fetch(path + '/' + rep.html)
-            .then(r => r.text())
-            .then(html => {
-              container.insertAdjacentHTML('beforeend', html);
-              initReport(rep, path);
-            });
+    fetch(path + '/reports.json')
+      .then(r => r.json())
+      .then(reports => {
+        container.innerHTML = '';
+        reports.forEach(rep => {
+          fetch(path + '/' + rep.html)
+            .then(r => r.text())
+            .then(html => {
+              container.insertAdjacentHTML('beforeend', html);
+              initReport(rep, path);
+            });
+        });
+        feather.replace();
       });
-    feather.replace();
   }
+
+  function loadAnalysis() {
+    analysisElems.missing.innerHTML = '<h2 class="title is-5">Missing Domains</h2>';
+    analysisElems.cache.innerHTML = '<h2 class="title is-5">Cache Suggestions</h2>';
+    analysisElems.threats.innerHTML = '<h2 class="title is-5">Threat Report</h2>';
+
+    fetch('analysis/missing_domains.json')
+      .then(r => r.json())
+      .then(list => {
+        if (list.length === 0) {
+          analysisElems.missing.insertAdjacentHTML('beforeend', '<p>None</p>');
+          return;
+        }
+        const items = list.map(d => `<li>${d}</li>`).join('');
+        analysisElems.missing.insertAdjacentHTML('beforeend', `<ul>${items}</ul>`);
+      });
+
+    fetch('analysis/cache_suggestions.json')
+      .then(r => r.json())
+      .then(data => {
+        if (data.length === 0) {
+          analysisElems.cache.insertAdjacentHTML('beforeend', '<p>No suggestions</p>');
+          return;
+        }
+        analysisElems.cache.insertAdjacentHTML('beforeend', '<table id="table-cache"></table>');
+        const rows = data.map(x => [x.host, x.path, x.misses]);
+        new DataTable('#table-cache', {
+          data: rows,
+          columns: [
+            { title: 'Domain' },
+            { title: 'Path' },
+            { title: 'Misses' }
+          ]
+        });
+      });
+
+    fetch('analysis/threat_report.json')
+      .then(r => r.json())
+      .then(rep => {
+        const hasData = rep.error_spikes?.length || rep.suspicious_agents?.length || rep.high_ip_requests?.length;
+        if (!hasData) {
+          analysisElems.threats.insertAdjacentHTML('beforeend', '<p>No threats detected</p>');
+          return;
+        }
+        if (rep.error_spikes && rep.error_spikes.length) {
+          analysisElems.threats.insertAdjacentHTML('beforeend', '<h3 class="title is-6">Error Spikes</h3><table id="table-errors"></table>');
+          const rows = rep.error_spikes.map(x => [x.host, x.recent_error_rate, x.previous_error_rate]);
+          new DataTable('#table-errors', {
+            data: rows,
+            columns: [
+              { title: 'Domain' },
+              { title: 'Recent %' },
+              { title: 'Previous %' }
+            ]
+          });
+        }
+        if (rep.suspicious_agents && rep.suspicious_agents.length) {
+          analysisElems.threats.insertAdjacentHTML('beforeend', '<h3 class="title is-6">Suspicious User Agents</h3><table id="table-agents"></table>');
+          const rows = rep.suspicious_agents.map(x => [x.user_agent, x.requests]);
+          new DataTable('#table-agents', {
+            data: rows,
+            columns: [
+              { title: 'User Agent' },
+              { title: 'Requests' }
+            ]
+          });
+        }
+        if (rep.high_ip_requests && rep.high_ip_requests.length) {
+          analysisElems.threats.insertAdjacentHTML('beforeend', '<h3 class="title is-6">High IP Requests</h3><table id="table-ips"></table>
    '); + const rows = rep.high_ip_requests.map(x => [x.ip, x.requests]); + new DataTable('#table-ips', { + data: rows, + columns: [ + { title: 'IP' }, + { title: 'Requests' } + ] + }); + } }); } @@ -198,7 +291,11 @@ if (name === 'overview') { loadStats(); } - loadReports(); + if (name === 'analysis') { + loadAnalysis(); + } else { + loadReports(); + } } intervalSelect.addEventListener('change', () => { From 0a05f0c010d791c8fcf9f33a9787f224bbac7ae3 Mon Sep 17 00:00:00 2001 From: Jordan Wages Date: Sat, 19 Jul 2025 02:55:15 -0500 Subject: [PATCH 04/34] Fix module imports in run-analysis --- run-analysis.sh | 6 +++--- scripts/__init__.py | 1 + tests/test_run_analysis.py | 6 +++--- 3 files changed, 7 insertions(+), 6 deletions(-) create mode 100644 scripts/__init__.py diff --git a/run-analysis.sh b/run-analysis.sh index 1f5c213..e487224 100755 --- a/run-analysis.sh +++ b/run-analysis.sh @@ -20,13 +20,13 @@ fi # Run analysis helpers echo "[INFO] Checking for missing domains..." -python scripts/analyze.py check-missing-domains +python -m scripts.analyze check-missing-domains echo "[INFO] Suggesting cache improvements..." -python scripts/analyze.py suggest-cache +python -m scripts.analyze suggest-cache echo "[INFO] Detecting threats..." -python scripts/analyze.py detect-threats +python -m scripts.analyze detect-threats # Deactivate to keep cron environment clean if type deactivate >/dev/null 2>&1; then diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..f4c57a1 --- /dev/null +++ b/scripts/__init__.py @@ -0,0 +1 @@ +"Utility package for ngxstat scripts" diff --git a/tests/test_run_analysis.py b/tests/test_run_analysis.py index 60d2810..66d00df 100644 --- a/tests/test_run_analysis.py +++ b/tests/test_run_analysis.py @@ -26,7 +26,7 @@ def test_script_invokes_commands(tmp_path): lines = calls.read_text().strip().splitlines() assert lines == [ - "scripts/analyze.py check-missing-domains", - "scripts/analyze.py suggest-cache", - "scripts/analyze.py detect-threats", + "-m scripts.analyze check-missing-domains", + "-m scripts.analyze suggest-cache", + "-m scripts.analyze detect-threats", ] From 2443aecaf65c4bbd8b66a4cf7ee34dc8ece69ab7 Mon Sep 17 00:00:00 2001 From: Jordan Wages Date: Sat, 19 Jul 2025 03:17:07 -0500 Subject: [PATCH 05/34] fix nginx config discovery for absolute include patterns --- scripts/nginx_config.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/scripts/nginx_config.py b/scripts/nginx_config.py index e2dc447..dbd635d 100644 --- a/scripts/nginx_config.py +++ b/scripts/nginx_config.py @@ -49,7 +49,15 @@ def discover_configs() -> Set[Path]: found.add(path) for pattern in INCLUDE_RE.findall(text): pattern = os.path.expanduser(pattern.strip()) - for included in path.parent.glob(pattern): + if os.path.isabs(pattern): + # ``Path.glob`` does not allow absolute patterns, so we + # anchor at the filesystem root and remove the leading + # separator. 
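+                # e.g. "include /etc/nginx/conf.d/*.conf" becomes
+                # Path("/").glob("etc/nginx/conf.d/*.conf")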
+ base = Path(os.sep) + glob_iter = base.glob(pattern.lstrip(os.sep)) + else: + glob_iter = path.parent.glob(pattern) + for included in glob_iter: if included.is_file() and included not in found: queue.append(included) return found From d1f3c5a9ae0fa66e56828743c7103eb2f967753d Mon Sep 17 00:00:00 2001 From: Jordan Wages Date: Sat, 19 Jul 2025 03:30:08 -0500 Subject: [PATCH 06/34] Hide analysis duration --- scripts/generate_reports.py | 2 +- templates/index.html | 2 +- tests/test_reports.py | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/generate_reports.py b/scripts/generate_reports.py index ae7a3a7..4b5692a 100644 --- a/scripts/generate_reports.py +++ b/scripts/generate_reports.py @@ -175,7 +175,7 @@ def _generate_root_index() -> None: intervals = [ p.name for p in OUTPUT_DIR.iterdir() - if p.is_dir() and p.name.lower() not in {"domains", "global"} + if p.is_dir() and p.name.lower() not in {"domains", "global", "analysis"} ] intervals.sort() diff --git a/templates/index.html b/templates/index.html index 7b0b98f..0a8abed 100644 --- a/templates/index.html +++ b/templates/index.html @@ -286,7 +286,7 @@ Object.entries(sections).forEach(([key, section]) => { section.classList.toggle('is-hidden', key !== name); }); - intervalControl.classList.toggle('is-hidden', name === 'overview'); + intervalControl.classList.toggle('is-hidden', name === 'overview' || name === 'analysis'); domainControl.classList.toggle('is-hidden', name !== 'domain'); if (name === 'overview') { loadStats(); diff --git a/tests/test_reports.py b/tests/test_reports.py index fec898c..dbe71c2 100644 --- a/tests/test_reports.py +++ b/tests/test_reports.py @@ -184,6 +184,8 @@ def test_generate_root_index(tmp_path, sample_reports, monkeypatch): (tmp_path / "output" / "domains" / "bar.com").mkdir(parents=True) # add an extra directory with capitalized name to ensure it's ignored (tmp_path / "output" / "Global").mkdir(parents=True) + # add an analysis directory to ensure it's excluded + (tmp_path / "output" / "analysis").mkdir(parents=True) gr._generate_root_index() @@ -196,6 +198,7 @@ def test_generate_root_index(tmp_path, sample_reports, monkeypatch): assert '