diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..95f9808 --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +exclude = .git, .venv, output, static/icons +max-line-length = 160 diff --git a/.forgejo/workflows/ci.yml b/.forgejo/workflows/ci.yml new file mode 100644 index 0000000..5cf26be --- /dev/null +++ b/.forgejo/workflows/ci.yml @@ -0,0 +1,151 @@ +name: CI + +on: + push: + pull_request: + workflow_dispatch: + +jobs: + ci: + name: Lint, test, and build + # This label must match your Forgejo runner's label + runs-on: docker + # Use a clean Debian container so tools are predictable + container: debian:stable-slim + env: + PYTHONDONTWRITEBYTECODE: "1" + PIP_DISABLE_PIP_VERSION_CHECK: "1" + UV_SYSTEM_PYTHON: "1" + steps: + - name: Install build tooling + run: | + set -euo pipefail + apt-get update + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + git ca-certificates python3 python3-venv python3-pip python3-setuptools \ + python3-wheel sqlite3 + update-ca-certificates || true + + - name: Checkout repository (manual) + run: | + set -euo pipefail + if [ -f Makefile ] || [ -d .git ]; then + echo "Repository present in workspace; skipping clone" + exit 0 + fi + REMOTE_URL="${CI_REPOSITORY_URL:-}" + if [ -z "$REMOTE_URL" ]; then + if [ -n "${GITHUB_SERVER_URL:-}" ] && [ -n "${GITHUB_REPOSITORY:-}" ]; then + REMOTE_URL="${GITHUB_SERVER_URL%/}/${GITHUB_REPOSITORY}.git" + elif [ -n "${GITHUB_REPOSITORY:-}" ]; then + REMOTE_URL="https://git.jordanwages.com/${GITHUB_REPOSITORY}.git" + else + echo "Unable to determine repository URL from CI environment" >&2 + exit 1 + fi + fi + AUTH_URL="$REMOTE_URL" + if [ -n "${GITHUB_TOKEN:-}" ]; then + ACTOR="${GITHUB_ACTOR:-oauth2}" + AUTH_URL=$(printf '%s' "$REMOTE_URL" | sed -E "s#^https://#https://${ACTOR}:${GITHUB_TOKEN}@#") + fi + echo "Cloning from: $REMOTE_URL" + if ! git clone --depth 1 "$AUTH_URL" .; then + echo "Auth clone failed; trying anonymous clone..." >&2 + git clone --depth 1 "$REMOTE_URL" . + fi + if [ -n "${GITHUB_SHA:-}" ]; then + git fetch --depth 1 origin "$GITHUB_SHA" || true + git checkout -q "$GITHUB_SHA" || true + elif [ -n "${GITHUB_REF_NAME:-}" ]; then + git fetch --depth 1 origin "$GITHUB_REF_NAME" || true + git checkout -q "$GITHUB_REF_NAME" || true + fi + + - name: Set up venv and install deps + run: | + set -euo pipefail + # Prefer persistent cache if runner provides /cache + USE_CACHE=0 + if [ -d /cache ] && [ -w /cache ]; then + export PIP_CACHE_DIR=/cache/pip + mkdir -p "$PIP_CACHE_DIR" + REQ_HASH=$(sha256sum requirements.txt | awk '{print $1}') + PYVER=$(python3 -c 'import sys;print(".".join(map(str, sys.version_info[:2])))') + CACHE_VENV="/cache/venv-${REQ_HASH}-py${PYVER}" + if [ ! -f "$CACHE_VENV/bin/activate" ]; then + echo "Preparing cached virtualenv: $CACHE_VENV" + rm -rf "$CACHE_VENV" || true + python3 -m venv "$CACHE_VENV" + fi + ln -sfn "$CACHE_VENV" .venv + USE_CACHE=1 + else + # Fallback to local venv + python3 -m venv .venv + fi + + # If the link didn't produce an activate file, fallback to local venv + if [ ! -f .venv/bin/activate ]; then + echo "Cached venv missing; creating local .venv" + rm -f .venv + python3 -m venv .venv + USE_CACHE=0 + fi + + . 
.venv/bin/activate + python -m pip install --upgrade pip + if [ "$USE_CACHE" = "1" ]; then + # Ensure required packages are present; pip will use cache + pip install -r requirements.txt pytest || pip install -r requirements.txt pytest + else + pip install -r requirements.txt pytest + fi + + - name: Format check (black) + run: | + . .venv/bin/activate + black --check . + + - name: Lint (flake8) + run: | + . .venv/bin/activate + flake8 . + + - name: Run tests (pytest) + run: | + . .venv/bin/activate + export PYTHONPATH="$(pwd)${PYTHONPATH:+:$PYTHONPATH}" + pytest -q --maxfail=1 + + - name: Build sample reports (no artifact upload) + run: | + set -euo pipefail + . .venv/bin/activate + python - <<'PY' + import sqlite3, pathlib + db = pathlib.Path('database/ngxstat.db') + db.parent.mkdir(parents=True, exist_ok=True) + conn = sqlite3.connect(db) + cur = conn.cursor() + cur.execute('''CREATE TABLE IF NOT EXISTS logs ( + id INTEGER PRIMARY KEY, + ip TEXT, + host TEXT, + time TEXT, + request TEXT, + status INTEGER, + bytes_sent INTEGER, + referer TEXT, + user_agent TEXT, + cache_status TEXT + )''') + cur.execute("INSERT INTO logs (ip, host, time, request, status, bytes_sent, referer, user_agent, cache_status) VALUES ('127.0.0.1','example.com','2024-01-01 10:00:00','GET / HTTP/1.1',200,100,'-','curl','MISS')") + cur.execute("INSERT INTO logs (ip, host, time, request, status, bytes_sent, referer, user_agent, cache_status) VALUES ('127.0.0.1','example.com','2024-01-01 10:05:00','GET /about HTTP/1.1',200,100,'-','curl','MISS')") + conn.commit(); conn.close() + PY + python scripts/generate_reports.py global + python scripts/generate_reports.py hourly + python scripts/generate_reports.py index + tar -czf ngxstat-reports.tar.gz -C output . + echo "Built sample reports archive: ngxstat-reports.tar.gz" diff --git a/README.md b/README.md index f641d96..ac601fc 100644 --- a/README.md +++ b/README.md @@ -39,9 +39,10 @@ all intervals in one go: ``` The script calls `scripts/generate_reports.py` internally to create hourly, -daily, weekly and monthly reports. Per-domain reports are written under -`output/domains/` alongside the aggregate data. Open -`output/index.html` in a browser to view the dashboard. +daily, weekly and monthly reports, then writes analysis JSON files used by the +"Analysis" tab. Per-domain reports are written under `output/domains/` +alongside the aggregate data. Open `output/index.html` in a browser to view the +dashboard. If you prefer to run individual commands you can invoke the generator directly: @@ -54,8 +55,14 @@ python scripts/generate_reports.py daily --all-domains `run-analysis.sh` executes additional utilities that examine the database for missing domains, caching opportunities and potential threats. The JSON output is -saved under `output/analysis` and appears in the "Analysis" tab of the -dashboard. +saved under `output/analysis` and appears in the "Analysis" tab. The +`run-reports.sh` script also generates these JSON files as part of the build. + +## UX Controls + +The dashboard defaults to a 7‑day window for time series. Your view preferences +persist locally in the browser under the `ngxstat-state-v2` key. Use the +"Reset view" button to clear saved state and restore defaults. 
```bash ./run-analysis.sh diff --git a/reports.yml b/reports.yml index 1ae8e6f..709d686 100644 --- a/reports.yml +++ b/reports.yml @@ -48,6 +48,7 @@ label: Top Domains icon: globe chart: table + top_n: 50 per_domain: false bucket: domain bucket_label: Domain @@ -75,6 +76,7 @@ label: Top Paths icon: map chart: table + top_n: 50 buckets: - domain - path @@ -102,6 +104,7 @@ label: User Agents icon: user chart: table + top_n: 50 buckets: - domain - user_agent @@ -127,6 +130,7 @@ label: Referrers icon: link chart: table + top_n: 50 buckets: - domain - referrer @@ -170,3 +174,40 @@ - "#209cee" - "#ffdd57" - "#f14668" + +# New time-series: status classes over time (stacked) +- name: status_classes_timeseries + label: Status Classes Over Time + icon: server + chart: stackedBar + bucket: time_bucket + bucket_label: Time + stacked: true + query: | + SELECT {bucket} AS time_bucket, + SUM(CASE WHEN status BETWEEN 200 AND 299 THEN 1 ELSE 0 END) AS "2xx", + SUM(CASE WHEN status BETWEEN 300 AND 399 THEN 1 ELSE 0 END) AS "3xx", + SUM(CASE WHEN status BETWEEN 400 AND 499 THEN 1 ELSE 0 END) AS "4xx", + SUM(CASE WHEN status BETWEEN 500 AND 599 THEN 1 ELSE 0 END) AS "5xx", + COUNT(*) AS total + FROM logs + GROUP BY time_bucket + ORDER BY time_bucket + +# New time-series: cache status over time (compact Hit/Miss; exclude '-' by default) +- name: cache_status_timeseries + label: Cache Status Over Time + icon: archive + chart: stackedBar + bucket: time_bucket + bucket_label: Time + stacked: true + exclude_values: ["-"] + query: | + SELECT {bucket} AS time_bucket, + SUM(CASE WHEN cache_status = 'HIT' THEN 1 ELSE 0 END) AS hit, + SUM(CASE WHEN cache_status = 'MISS' THEN 1 ELSE 0 END) AS miss, + COUNT(*) AS total + FROM logs + GROUP BY time_bucket + ORDER BY time_bucket diff --git a/run-reports.sh b/run-reports.sh index bfe736d..f7cffba 100755 --- a/run-reports.sh +++ b/run-reports.sh @@ -29,21 +29,25 @@ fi # Generate reports for all domains combined echo "[INFO] Generating aggregate reports..." -python scripts/generate_reports.py hourly -python scripts/generate_reports.py daily -python scripts/generate_reports.py weekly -python scripts/generate_reports.py monthly -python scripts/generate_reports.py global +python -m scripts.generate_reports hourly +python -m scripts.generate_reports daily +python -m scripts.generate_reports weekly +python -m scripts.generate_reports monthly +python -m scripts.generate_reports global # Generate reports for each individual domain echo "[INFO] Generating per-domain reports..." -python scripts/generate_reports.py hourly --all-domains -python scripts/generate_reports.py daily --all-domains -python scripts/generate_reports.py weekly --all-domains -python scripts/generate_reports.py monthly --all-domains +python -m scripts.generate_reports hourly --all-domains +python -m scripts.generate_reports daily --all-domains +python -m scripts.generate_reports weekly --all-domains +python -m scripts.generate_reports monthly --all-domains + +# Generate analysis JSON +echo "[INFO] Generating analysis files..." 
+python -m scripts.generate_reports analysis # Generate root index -python scripts/generate_reports.py index +python -m scripts.generate_reports index # Deactivate to keep cron environment clean if type deactivate >/dev/null 2>&1; then diff --git a/scripts/analyze.py b/scripts/analyze.py index fe7b818..9f49978 100644 --- a/scripts/analyze.py +++ b/scripts/analyze.py @@ -18,7 +18,7 @@ from __future__ import annotations import sqlite3 from pathlib import Path -from typing import Dict, List, Optional, Set +from typing import List, Optional, Set from datetime import datetime, timedelta import json @@ -155,10 +155,9 @@ def check_missing_domains( typer.echo(d) -@app.command("suggest-cache") def suggest_cache( - threshold: int = typer.Option(10, help="Minimum number of MISS entries to report"), - json_output: bool = typer.Option(False, "--json", help="Output results as JSON"), + threshold: int = 10, + json_output: bool = False, ) -> None: """Suggest domain/path pairs that could benefit from caching. @@ -191,7 +190,7 @@ def suggest_cache( HAVING miss_count >= ? ORDER BY miss_count DESC """, - (threshold,), + (int(threshold),), ) rows = [r for r in cur.fetchall() if r[0] in no_cache] @@ -211,11 +210,18 @@ def suggest_cache( for item in result: typer.echo(f"{item['host']} {item['path']} {item['misses']}") +@app.command("suggest-cache") +def suggest_cache_cli( + threshold: int = typer.Option(10, help="Minimum number of MISS entries to report"), + json_output: bool = typer.Option(False, "--json", help="Output results as JSON"), +) -> None: + """CLI wrapper for suggest_cache.""" + suggest_cache(threshold=threshold, json_output=json_output) + -@app.command("detect-threats") def detect_threats( - hours: int = typer.Option(1, help="Number of recent hours to analyze"), - ip_threshold: int = typer.Option(100, help="Requests from a single IP to flag"), + hours: int = 1, + ip_threshold: int = 100, ) -> None: """Detect potential security threats from recent logs.""" @@ -231,8 +237,8 @@ def detect_threats( max_dt = datetime.strptime(row[0], "%Y-%m-%d %H:%M:%S") recent_end = max_dt - recent_start = recent_end - timedelta(hours=hours) - prev_start = recent_start - timedelta(hours=hours) + recent_start = recent_end - timedelta(hours=int(hours)) + prev_start = recent_start - timedelta(hours=int(hours)) prev_end = recent_start fmt = "%Y-%m-%d %H:%M:%S" @@ -339,6 +345,14 @@ def detect_threats( out_path.write_text(json.dumps(report, indent=2)) typer.echo(json.dumps(report)) +@app.command("detect-threats") +def detect_threats_cli( + hours: int = typer.Option(1, help="Number of recent hours to analyze"), + ip_threshold: int = typer.Option(100, help="Requests from a single IP to flag"), +) -> None: + """CLI wrapper for detect_threats.""" + detect_threats(hours=hours, ip_threshold=ip_threshold) + if __name__ == "__main__": app() diff --git a/scripts/generate_reports.py b/scripts/generate_reports.py index 265da2d..d3c2f8a 100644 --- a/scripts/generate_reports.py +++ b/scripts/generate_reports.py @@ -1,9 +1,10 @@ import json +import sys import sqlite3 from pathlib import Path import shutil from typing import List, Dict, Optional -from datetime import datetime +from datetime import datetime, timezone import time import yaml @@ -11,10 +12,16 @@ import yaml import typer from jinja2 import Environment, FileSystemLoader +# Ensure project root is importable when running as a script (python scripts/generate_reports.py) +PROJECT_ROOT = Path(__file__).resolve().parent.parent +if str(PROJECT_ROOT) not in sys.path: + 
sys.path.insert(0, str(PROJECT_ROOT)) + DB_PATH = Path("database/ngxstat.db") OUTPUT_DIR = Path("output") TEMPLATE_DIR = Path("templates") REPORT_CONFIG = Path("reports.yml") +GENERATED_MARKER = OUTPUT_DIR / "generated.txt" # Mapping of interval names to SQLite strftime formats. These strings are # substituted into report queries whenever the special ``{bucket}`` token is @@ -30,6 +37,19 @@ INTERVAL_FORMATS = { app = typer.Typer(help="Generate aggregated log reports") +@app.callback() +def _cli_callback(ctx: typer.Context) -> None: + """Register post-command hook to note generation time.""" + + def _write_marker() -> None: + OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + # Use timezone-aware UTC to avoid deprecation warnings and ambiguity + timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S") + GENERATED_MARKER.write_text(f"{timestamp}\n") + + ctx.call_on_close(_write_marker) + + def _get_domains() -> List[str]: """Return a sorted list of unique domains from the logs table.""" conn = sqlite3.connect(DB_PATH) @@ -165,6 +185,16 @@ def _generate_interval(interval: str, domain: Optional[str] = None) -> None: name = definition["name"] query = definition["query"].replace("{bucket}", bucket) query = query.replace("FROM logs", "FROM logs_view") + # Apply top_n limit for tables (performance-friendly), if configured + top_n = definition.get("top_n") + chart_type = definition.get("chart", "line") + if top_n and chart_type == "table": + try: + n = int(top_n) + if "LIMIT" not in query.upper(): + query = f"{query}\nLIMIT {n}" + except Exception: + pass cur.execute(query) rows = cur.fetchall() headers = [c[0] for c in cur.description] @@ -190,6 +220,18 @@ def _generate_interval(interval: str, domain: Optional[str] = None) -> None: entry["color"] = definition["color"] if "colors" in definition: entry["colors"] = definition["colors"] + # Optional UX metadata passthrough for frontend-only transforms + for key in ( + "windows_supported", + "window_default", + "group_others_threshold", + "exclude_values", + "top_n", + "stacked", + "palette", + ): + if key in definition: + entry[key] = definition[key] _render_snippet(entry, out_dir) report_list.append(entry) @@ -236,7 +278,8 @@ def _generate_global() -> None: return start_time = time.time() - generated_at = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + # Use timezone-aware UTC for generated_at (string remains unchanged format) + generated_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S") _copy_icons() @@ -253,6 +296,16 @@ def _generate_global() -> None: name = definition["name"] query = definition["query"] + # Apply top_n limit for tables (performance-friendly), if configured + top_n = definition.get("top_n") + chart_type = definition.get("chart", "line") + if top_n and chart_type == "table": + try: + n = int(top_n) + if "LIMIT" not in query.upper(): + query = f"{query}\nLIMIT {n}" + except Exception: + pass cur.execute(query) rows = cur.fetchall() headers = [c[0] for c in cur.description] @@ -278,6 +331,18 @@ def _generate_global() -> None: entry["color"] = definition["color"] if "colors" in definition: entry["colors"] = definition["colors"] + # Optional UX metadata passthrough for frontend-only transforms + for key in ( + "windows_supported", + "window_default", + "group_others_threshold", + "exclude_values", + "top_n", + "stacked", + "palette", + ): + if key in definition: + entry[key] = definition[key] _render_snippet(entry, out_dir) report_list.append(entry) @@ -287,6 +352,34 @@ def _generate_global() -> None: 
typer.echo("Generated global reports") +def _generate_analysis() -> None: + """Generate analysis JSON files consumed by the Analysis tab.""" + try: + # Import lazily to avoid circulars and keep dependencies optional + from scripts import analyze + except Exception as exc: # pragma: no cover - defensive + typer.echo(f"Failed to import analysis module: {exc}") + return + + # Ensure output root and icons present for parity + _copy_icons() + + # These commands write JSON files under output/analysis/ + try: + analyze.check_missing_domains(json_output=True) + except Exception as exc: # pragma: no cover - continue best-effort + typer.echo(f"check_missing_domains failed: {exc}") + try: + analyze.suggest_cache(json_output=True) + except Exception as exc: # pragma: no cover + typer.echo(f"suggest_cache failed: {exc}") + try: + analyze.detect_threats() + except Exception as exc: # pragma: no cover + typer.echo(f"detect_threats failed: {exc}") + typer.echo("Generated analysis JSON files") + + @app.command() def hourly( domain: Optional[str] = typer.Option( @@ -357,6 +450,12 @@ def global_reports() -> None: _generate_global() +@app.command() +def analysis() -> None: + """Generate analysis JSON files for the Analysis tab.""" + _generate_analysis() + + @app.command() def index() -> None: """Generate the root index page linking all reports.""" diff --git a/static/chartManager.js b/static/chartManager.js index 79d83fc..2f14f4f 100644 --- a/static/chartManager.js +++ b/static/chartManager.js @@ -47,3 +47,63 @@ export function reset(container) { }); container.innerHTML = ''; } + +// ---- Lightweight client-side data helpers ---- + +// Slice last N rows from a time-ordered array +export function sliceWindow(data, n) { + if (!Array.isArray(data) || n === undefined || n === null) return data; + if (n === 'all') return data; + const count = Number(n); + if (!Number.isFinite(count) || count <= 0) return data; + return data.slice(-count); +} + +// Exclude rows whose value in key is in excluded list +export function excludeValues(data, key, excluded = []) { + if (!excluded || excluded.length === 0) return data; + const set = new Set(excluded); + return data.filter(row => !set.has(row[key])); +} + +// Compute percentages for categorical distributions (valueKey default 'value') +export function toPercent(data, valueKey = 'value') { + const total = data.reduce((s, r) => s + (Number(r[valueKey]) || 0), 0); + if (total <= 0) return data.map(r => ({ ...r })); + return data.map(r => ({ ...r, [valueKey]: (Number(r[valueKey]) || 0) * 100 / total })); +} + +// Group categories with share < threshold into an 'Other' bucket. 
+export function groupOthers(data, bucketKey, valueKey = 'value', threshold = 0.03, otherLabel = 'Other') { + if (!Array.isArray(data) || data.length === 0) return data; + const total = data.reduce((s, r) => s + (Number(r[valueKey]) || 0), 0); + if (total <= 0) return data; + const major = []; + let other = 0; + for (const r of data) { + const v = Number(r[valueKey]) || 0; + if (total && v / total < threshold) { + other += v; + } else { + major.push({ ...r }); + } + } + if (other > 0) major.push({ [bucketKey]: otherLabel, [valueKey]: other }); + return major; +} + +// Simple moving average over numeric array +export function movingAverage(series, span = 3) { + const n = Math.max(1, Number(span) || 1); + const out = []; + for (let i = 0; i < series.length; i++) { + const start = Math.max(0, i - n + 1); + let sum = 0, cnt = 0; + for (let j = start; j <= i; j++) { + const v = Number(series[j]); + if (Number.isFinite(v)) { sum += v; cnt++; } + } + out.push(cnt ? sum / cnt : null); + } + return out; +} diff --git a/templates/index.html b/templates/index.html index edb53f6..a5de3db 100644 --- a/templates/index.html +++ b/templates/index.html @@ -12,14 +12,15 @@ -
[templates/index.html hunk body not recoverable: the HTML markup was stripped during extraction. Only the visible text survives: the tab labels "Overview" and "Recent" and the dashboard summary placeholders "Total logs: -", "Date range: - to -", "Unique domains: -", "Last generated: -" and "Generation time: - seconds". The added lines appear to introduce the time-window and "Reset view" controls described in the README changes above, but the markup itself cannot be reconstructed here.]
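
The dashboard wiring that consumes the new chartManager.js helpers sat in that lost index.html hunk, so the following is only a rough sketch of how the pieces could fit together, not code from this change. It combines the helpers with the UX metadata that generate_reports.py now passes through; the metadata keys (window_default, exclude_values, group_others_threshold) come from the passthrough added above, while report.bucket, the rows argument, and the 'value' column name are assumptions made purely for illustration.

```js
// Hypothetical wiring, not part of this diff: compose the new helpers using
// the per-report UX metadata emitted into the report JSON.
import {
  sliceWindow,
  excludeValues,
  groupOthers,
  movingAverage,
} from './chartManager.js';

// `report` is assumed to be one entry of the generated report JSON;
// `rows` is assumed to be its time-ordered data array.
function prepareRows(report, rows, selectedWindow) {
  // Drop excluded category values, e.g. a cache_status of '-'.
  if (Array.isArray(report.exclude_values) && report.bucket) {
    rows = excludeValues(rows, report.bucket, report.exclude_values);
  }
  // Keep only the last N buckets; 'all' disables the window entirely.
  rows = sliceWindow(rows, selectedWindow ?? report.window_default ?? 'all');
  // Fold categories below the configured share threshold into 'Other'.
  if (report.group_others_threshold && report.bucket) {
    rows = groupOthers(rows, report.bucket, 'value', report.group_others_threshold);
  }
  return rows;
}

// Smoothing a numeric series with a 3-bucket moving average:
console.log(movingAverage([120, 90, 150, 80, 200], 3));
// -> [120, 105, 120, ~106.7, ~143.3]
```

Because sliceWindow falls back to the report's window_default, a 7-day default like the one described in the README could live in the generated JSON rather than being hard-coded in the page script.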