diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..95f9808 --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +exclude = .git, .venv, output, static/icons +max-line-length = 160 diff --git a/.forgejo/workflows/ci.yml b/.forgejo/workflows/ci.yml new file mode 100644 index 0000000..5cf26be --- /dev/null +++ b/.forgejo/workflows/ci.yml @@ -0,0 +1,151 @@ +name: CI + +on: + push: + pull_request: + workflow_dispatch: + +jobs: + ci: + name: Lint, test, and build + # This label must match your Forgejo runner's label + runs-on: docker + # Use a clean Debian container so tools are predictable + container: debian:stable-slim + env: + PYTHONDONTWRITEBYTECODE: "1" + PIP_DISABLE_PIP_VERSION_CHECK: "1" + UV_SYSTEM_PYTHON: "1" + steps: + - name: Install build tooling + run: | + set -euo pipefail + apt-get update + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + git ca-certificates python3 python3-venv python3-pip python3-setuptools \ + python3-wheel sqlite3 + update-ca-certificates || true + + - name: Checkout repository (manual) + run: | + set -euo pipefail + if [ -f Makefile ] || [ -d .git ]; then + echo "Repository present in workspace; skipping clone" + exit 0 + fi + REMOTE_URL="${CI_REPOSITORY_URL:-}" + if [ -z "$REMOTE_URL" ]; then + if [ -n "${GITHUB_SERVER_URL:-}" ] && [ -n "${GITHUB_REPOSITORY:-}" ]; then + REMOTE_URL="${GITHUB_SERVER_URL%/}/${GITHUB_REPOSITORY}.git" + elif [ -n "${GITHUB_REPOSITORY:-}" ]; then + REMOTE_URL="https://git.jordanwages.com/${GITHUB_REPOSITORY}.git" + else + echo "Unable to determine repository URL from CI environment" >&2 + exit 1 + fi + fi + AUTH_URL="$REMOTE_URL" + if [ -n "${GITHUB_TOKEN:-}" ]; then + ACTOR="${GITHUB_ACTOR:-oauth2}" + AUTH_URL=$(printf '%s' "$REMOTE_URL" | sed -E "s#^https://#https://${ACTOR}:${GITHUB_TOKEN}@#") + fi + echo "Cloning from: $REMOTE_URL" + if ! git clone --depth 1 "$AUTH_URL" .; then + echo "Auth clone failed; trying anonymous clone..." >&2 + git clone --depth 1 "$REMOTE_URL" . + fi + if [ -n "${GITHUB_SHA:-}" ]; then + git fetch --depth 1 origin "$GITHUB_SHA" || true + git checkout -q "$GITHUB_SHA" || true + elif [ -n "${GITHUB_REF_NAME:-}" ]; then + git fetch --depth 1 origin "$GITHUB_REF_NAME" || true + git checkout -q "$GITHUB_REF_NAME" || true + fi + + - name: Set up venv and install deps + run: | + set -euo pipefail + # Prefer persistent cache if runner provides /cache + USE_CACHE=0 + if [ -d /cache ] && [ -w /cache ]; then + export PIP_CACHE_DIR=/cache/pip + mkdir -p "$PIP_CACHE_DIR" + REQ_HASH=$(sha256sum requirements.txt | awk '{print $1}') + PYVER=$(python3 -c 'import sys;print(".".join(map(str, sys.version_info[:2])))') + CACHE_VENV="/cache/venv-${REQ_HASH}-py${PYVER}" + if [ ! -f "$CACHE_VENV/bin/activate" ]; then + echo "Preparing cached virtualenv: $CACHE_VENV" + rm -rf "$CACHE_VENV" || true + python3 -m venv "$CACHE_VENV" + fi + ln -sfn "$CACHE_VENV" .venv + USE_CACHE=1 + else + # Fallback to local venv + python3 -m venv .venv + fi + + # If the link didn't produce an activate file, fallback to local venv + if [ ! -f .venv/bin/activate ]; then + echo "Cached venv missing; creating local .venv" + rm -f .venv + python3 -m venv .venv + USE_CACHE=0 + fi + + . 
.venv/bin/activate + python -m pip install --upgrade pip + if [ "$USE_CACHE" = "1" ]; then + # Ensure required packages are present; pip will use cache + pip install -r requirements.txt pytest || pip install -r requirements.txt pytest + else + pip install -r requirements.txt pytest + fi + + - name: Format check (black) + run: | + . .venv/bin/activate + black --check . + + - name: Lint (flake8) + run: | + . .venv/bin/activate + flake8 . + + - name: Run tests (pytest) + run: | + . .venv/bin/activate + export PYTHONPATH="$(pwd)${PYTHONPATH:+:$PYTHONPATH}" + pytest -q --maxfail=1 + + - name: Build sample reports (no artifact upload) + run: | + set -euo pipefail + . .venv/bin/activate + python - <<'PY' + import sqlite3, pathlib + db = pathlib.Path('database/ngxstat.db') + db.parent.mkdir(parents=True, exist_ok=True) + conn = sqlite3.connect(db) + cur = conn.cursor() + cur.execute('''CREATE TABLE IF NOT EXISTS logs ( + id INTEGER PRIMARY KEY, + ip TEXT, + host TEXT, + time TEXT, + request TEXT, + status INTEGER, + bytes_sent INTEGER, + referer TEXT, + user_agent TEXT, + cache_status TEXT + )''') + cur.execute("INSERT INTO logs (ip, host, time, request, status, bytes_sent, referer, user_agent, cache_status) VALUES ('127.0.0.1','example.com','2024-01-01 10:00:00','GET / HTTP/1.1',200,100,'-','curl','MISS')") + cur.execute("INSERT INTO logs (ip, host, time, request, status, bytes_sent, referer, user_agent, cache_status) VALUES ('127.0.0.1','example.com','2024-01-01 10:05:00','GET /about HTTP/1.1',200,100,'-','curl','MISS')") + conn.commit(); conn.close() + PY + python scripts/generate_reports.py global + python scripts/generate_reports.py hourly + python scripts/generate_reports.py index + tar -czf ngxstat-reports.tar.gz -C output . + echo "Built sample reports archive: ngxstat-reports.tar.gz" diff --git a/AGENTS.md b/AGENTS.md index 4cdfa62..7e7d3c5 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -24,6 +24,9 @@ This document outlines general practices and expectations for AI agents assistin The `run-import.sh` script can initialize this environment automatically. Always activate the virtual environment before running scripts or tests. +* Before committing code run `black` for consistent formatting and execute + the test suite with `pytest`. All tests should pass. + * Dependency management: Use `requirements.txt` or `pip-tools` * Use standard libraries where feasible (e.g., `sqlite3`, `argparse`, `datetime`) * Adopt `typer` for CLI command interface (if CLI ergonomics matter) @@ -89,6 +92,14 @@ ngxstat/ If uncertain, the agent should prompt the human for clarification before making architectural assumptions. +## Testing + +Use `pytest` for automated tests. Run the suite from an activated virtual environment and ensure all tests pass before committing: + +```bash +pytest -q +``` + --- ## Future Capabilities @@ -106,3 +117,4 @@ As the project matures, agents may also: * **2025-07-17**: Initial version by Jordan + ChatGPT * **2025-07-17**: Expanded virtual environment usage guidance + diff --git a/README.md b/README.md index acb1055..ac601fc 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,16 @@ # ngxstat -Per-domain Nginx log analytics with hybrid static reports and live insights. -## Generating Reports +`ngxstat` is a lightweight log analytics toolkit for Nginx. It imports access +logs into an SQLite database and renders static dashboards so you can explore +per-domain metrics without running a heavy backend service. 
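As a quick way to see what the importer has stored, a minimal sketch like the following can query the database directly; it assumes the default `database/ngxstat.db` path and the `logs` table columns used in the CI workflow's sample-data step above (`host`, `time`, `status`, and so on).

```python
import sqlite3
from pathlib import Path

# Path and column names assumed from the repository defaults shown above.
db_path = Path("database/ngxstat.db")

conn = sqlite3.connect(db_path)
cur = conn.cursor()

# Total rows and the newest timestamp the importer has ingested.
cur.execute("SELECT COUNT(*), MAX(time) FROM logs")
total, latest = cur.fetchone()
print(f"{total} log entries, newest at {latest}")

# Requests per domain, mirroring the per-domain reports the generator builds.
cur.execute("SELECT host, COUNT(*) FROM logs GROUP BY host ORDER BY COUNT(*) DESC")
for host, hits in cur.fetchall():
    print(f"{host}: {hits}")

conn.close()
```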
-Use the `generate_reports.py` script to build aggregated JSON and HTML snippet files from `database/ngxstat.db`. +## Requirements -Create a virtual environment and install dependencies: +* Python 3.10+ +* Access to the Nginx log files (default: `/var/log/nginx`) + +The helper scripts create a virtual environment on first run, but you can also +set one up manually: ```bash python3 -m venv .venv @@ -13,118 +18,95 @@ source .venv/bin/activate pip install -r requirements.txt ``` -Then run one or more of the interval commands: - -```bash -python scripts/generate_reports.py hourly -python scripts/generate_reports.py daily -python scripts/generate_reports.py weekly -python scripts/generate_reports.py monthly -``` - -Each command accepts optional flags to generate per-domain reports. Use -`--domain ` to limit output to a specific domain or `--all-domains` -to generate a subdirectory for every domain found in the database: - -```bash -# Hourly reports for example.com only -python scripts/generate_reports.py hourly --domain example.com - -# Weekly reports for all domains individually -python scripts/generate_reports.py weekly --all-domains -``` - -Reports are written under the `output/` directory. Each command updates the corresponding `.json` file and writes one HTML snippet per report. These snippets are loaded dynamically by the main dashboard using Chart.js and DataTables. - -### Configuring Reports - -Report queries are defined in `reports.yml`. Each entry specifies the `name`, -optional `label` and `chart` type, and a SQL `query` that must return `bucket` -and `value` columns. The special token `{bucket}` is replaced with the -appropriate SQLite `strftime` expression for each interval (hourly, daily, -weekly or monthly) so that a single definition works across all durations. -When `generate_reports.py` runs, every definition is executed for the requested -interval and creates `output//.json` plus a small HTML snippet -`output//.html` used by the dashboard. - -Example snippet: - -```yaml -- name: hits - chart: bar - query: | - SELECT {bucket} AS bucket, - COUNT(*) AS value - FROM logs - GROUP BY bucket - ORDER BY bucket -``` - -Add or modify entries in `reports.yml` to tailor the generated metrics. - ## Importing Logs -Use the `run-import.sh` script to set up the Python environment if needed and import the latest Nginx log entries into `database/ngxstat.db`. +Run the importer to ingest new log entries into `database/ngxstat.db`: ```bash ./run-import.sh ``` -This script is suitable for cron jobs as it creates the virtual environment on first run, installs dependencies and reuses the environment on subsequent runs. +Rotated logs are processed in order and only entries newer than the last +imported timestamp are added. -The importer handles rotated logs in order from oldest to newest so entries are -processed exactly once. If you rerun the script, it only ingests records with a -timestamp newer than the latest one already stored in the database, preventing -duplicates. +## Generating Reports -## Cron Report Generation - -Use the `run-reports.sh` script to run all report intervals in one step. The script sets up the Python environment the same way as `run-import.sh`, making it convenient for automation via cron. +To build the HTML dashboard and JSON data files use `run-reports.sh` which runs +all intervals in one go: ```bash ./run-reports.sh ``` -Running this script will create or update the hourly, daily, weekly and monthly reports under `output/`. 
It also detects all unique domains found in the database and writes per-domain reports to `output/domains//` alongside the aggregate data. After generation, open `output/index.html` in your browser to browse the reports. +The script calls `scripts/generate_reports.py` internally to create hourly, +daily, weekly and monthly reports, then writes analysis JSON files used by the +"Analysis" tab. Per-domain reports are written under `output/domains/` +alongside the aggregate data. Open `output/index.html` in a browser to view the +dashboard. +If you prefer to run individual commands you can invoke the generator directly: -## Log Analysis +```bash +python scripts/generate_reports.py hourly +python scripts/generate_reports.py daily --all-domains +``` -The `run-analysis.sh` script runs helper routines that inspect the database. It -creates or reuses the virtual environment and then executes a set of analysis -commands to spot missing domains, suggest cache rules and detect potential -threats. +## Analysis Helpers + +`run-analysis.sh` executes additional utilities that examine the database for +missing domains, caching opportunities and potential threats. The JSON output is +saved under `output/analysis` and appears in the "Analysis" tab. The +`run-reports.sh` script also generates these JSON files as part of the build. + +## UX Controls + +The dashboard defaults to a 7‑day window for time series. Your view preferences +persist locally in the browser under the `ngxstat-state-v2` key. Use the +"Reset view" button to clear saved state and restore defaults. ```bash ./run-analysis.sh ``` -The JSON results are written under `output/analysis` and can be viewed from the -"Analysis" tab in the generated dashboard. -## Serving Reports with Nginx -To expose the generated HTML dashboards and JSON files over HTTP you can use a -simple Nginx server block. Point the `root` directive to the repository's -`output/` directory and optionally restrict access to your local network. +## Serving the Reports + +The generated files are static. You can serve them with a simple Nginx block: ```nginx server { listen 80; server_name example.com; - - # Path to the generated reports root /path/to/ngxstat/output; location / { try_files $uri $uri/ =404; } - - # Allow access only from private networks - allow 192.0.0.0/8; - allow 10.0.0.0/8; - deny all; } ``` -With this configuration the generated static files are served directly by -Nginx while connections outside of `192.*` and `10.*` are denied. +Restrict access if the reports should not be public. +## Running Tests + +Install the development dependencies and execute the suite with `pytest`: + +```bash +pip install -r requirements.txt +pytest -q +``` + +All tests must pass before submitting changes. + +## Acknowledgements + +ngxstat uses the following third‑party resources: + +* [Chart.js](https://www.chartjs.org/) for charts +* [DataTables](https://datatables.net/) and [jQuery](https://jquery.com/) for table views +* [Bulma CSS](https://bulma.io/) for styling +* Icons from [Free CC0 Icons](https://cc0-icons.jonh.eu/) by Jon Hicks (CC0 / MIT) +* [Typer](https://typer.tiangolo.com/) for the command-line interface +* [Jinja2](https://palletsprojects.com/p/jinja/) for templating + +The project is licensed under the GPLv3. Icon assets remain in the public domain +via the CC0 license. 
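The `reports.yml` changes that follow rename the generic `bucket` aliases to descriptive column names and lean on the `{bucket}` token substitution performed by `scripts/generate_reports.py` (visible later in this diff as `query.replace("{bucket}", bucket)`). Below is a minimal sketch of that substitution; the `strftime` format strings are assumptions for illustration, since the real `INTERVAL_FORMATS` values are outside this patch.

```python
import sqlite3

# Hypothetical interval formats; the real values live in INTERVAL_FORMATS
# inside scripts/generate_reports.py and may differ.
INTERVAL_FORMATS = {
    "hourly": "strftime('%Y-%m-%d %H:00', time)",
    "daily": "strftime('%Y-%m-%d', time)",
}

definition_query = """
    SELECT {bucket} AS time_bucket,
           COUNT(*) AS value
    FROM logs
    GROUP BY time_bucket
    ORDER BY time_bucket
"""

# The same substitution the generator performs for each requested interval.
query = definition_query.replace("{bucket}", INTERVAL_FORMATS["hourly"])

conn = sqlite3.connect("database/ngxstat.db")
rows = conn.execute(query).fetchall()
conn.close()
print(rows[:5])
```

In the generator itself, `_generate_interval` additionally redirects the query to a `logs_view`; the sketch keeps the plain `logs` table for brevity.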
diff --git a/reports.yml b/reports.yml index 1622538..709d686 100644 --- a/reports.yml +++ b/reports.yml @@ -2,30 +2,36 @@ label: Hits icon: pulse chart: line + bucket: time_bucket + bucket_label: Time query: | - SELECT {bucket} AS bucket, + SELECT {bucket} AS time_bucket, COUNT(*) AS value FROM logs - GROUP BY bucket - ORDER BY bucket + GROUP BY time_bucket + ORDER BY time_bucket - name: error_rate label: Error Rate (%) icon: file-alert chart: line + bucket: time_bucket + bucket_label: Time query: | - SELECT {bucket} AS bucket, + SELECT {bucket} AS time_bucket, SUM(CASE WHEN status BETWEEN 400 AND 599 THEN 1 ELSE 0 END) * 100.0 / COUNT(*) AS value FROM logs - GROUP BY bucket - ORDER BY bucket + GROUP BY time_bucket + ORDER BY time_bucket - name: cache_status_breakdown label: Cache Status icon: archive chart: polarArea + bucket: cache_status + bucket_label: Cache Status query: | - SELECT cache_status AS bucket, + SELECT cache_status AS cache_status, COUNT(*) AS value FROM logs GROUP BY cache_status @@ -42,82 +48,166 @@ label: Top Domains icon: globe chart: table + top_n: 50 per_domain: false + bucket: domain + bucket_label: Domain query: | - SELECT host AS bucket, + SELECT host AS domain, COUNT(*) AS value FROM logs - GROUP BY host + GROUP BY domain ORDER BY value DESC - name: bytes_sent label: Bytes Sent icon: upload chart: line + bucket: time_bucket + bucket_label: Time query: | - SELECT {bucket} AS bucket, + SELECT {bucket} AS time_bucket, SUM(bytes_sent) AS value FROM logs - GROUP BY bucket - ORDER BY bucket + GROUP BY time_bucket + ORDER BY time_bucket - name: top_paths label: Top Paths icon: map chart: table + top_n: 50 + buckets: + - domain + - path + bucket_label: + - Domain + - Path query: | - SELECT path AS bucket, - COUNT(*) AS value - FROM ( - SELECT substr(substr(request, instr(request, ' ') + 1), 1, + WITH paths AS ( + SELECT host AS domain, + substr(substr(request, instr(request, ' ') + 1), 1, instr(substr(request, instr(request, ' ') + 1), ' ') - 1) AS path FROM logs + ), ranked AS ( + SELECT domain, path, COUNT(*) AS value, + ROW_NUMBER() OVER (PARTITION BY domain ORDER BY COUNT(*) DESC) AS rn + FROM paths + GROUP BY domain, path ) - GROUP BY path - ORDER BY value DESC - LIMIT 20 + SELECT domain, path, value + FROM ranked + WHERE rn <= 20 + ORDER BY domain, value DESC - name: user_agents label: User Agents icon: user chart: table + top_n: 50 + buckets: + - domain + - user_agent + bucket_label: + - Domain + - User Agent query: | - SELECT user_agent AS bucket, - COUNT(*) AS value - FROM logs - GROUP BY user_agent - ORDER BY value DESC - LIMIT 20 + WITH ua AS ( + SELECT host AS domain, user_agent + FROM logs + ), ranked AS ( + SELECT domain, user_agent, COUNT(*) AS value, + ROW_NUMBER() OVER (PARTITION BY domain ORDER BY COUNT(*) DESC) AS rn + FROM ua + GROUP BY domain, user_agent + ) + SELECT domain, user_agent, value + FROM ranked + WHERE rn <= 20 + ORDER BY domain, value DESC - name: referrers label: Referrers icon: link chart: table + top_n: 50 + buckets: + - domain + - referrer + bucket_label: + - Domain + - Referrer query: | - SELECT referer AS bucket, - COUNT(*) AS value - FROM logs - GROUP BY referer - ORDER BY value DESC - LIMIT 20 + WITH ref AS ( + SELECT host AS domain, referer AS referrer + FROM logs + ), ranked AS ( + SELECT domain, referrer, COUNT(*) AS value, + ROW_NUMBER() OVER (PARTITION BY domain ORDER BY COUNT(*) DESC) AS rn + FROM ref + GROUP BY domain, referrer + ) + SELECT domain, referrer, value + FROM ranked + WHERE rn <= 20 + ORDER BY domain, value 
DESC - name: status_distribution label: HTTP Statuses icon: server chart: pie + bucket: status_group + bucket_label: Status query: | SELECT CASE WHEN status BETWEEN 200 AND 299 THEN '2xx' WHEN status BETWEEN 300 AND 399 THEN '3xx' WHEN status BETWEEN 400 AND 499 THEN '4xx' ELSE '5xx' - END AS bucket, + END AS status_group, COUNT(*) AS value FROM logs - GROUP BY bucket - ORDER BY bucket + GROUP BY status_group + ORDER BY status_group colors: - "#48c78e" - "#209cee" - "#ffdd57" - "#f14668" + +# New time-series: status classes over time (stacked) +- name: status_classes_timeseries + label: Status Classes Over Time + icon: server + chart: stackedBar + bucket: time_bucket + bucket_label: Time + stacked: true + query: | + SELECT {bucket} AS time_bucket, + SUM(CASE WHEN status BETWEEN 200 AND 299 THEN 1 ELSE 0 END) AS "2xx", + SUM(CASE WHEN status BETWEEN 300 AND 399 THEN 1 ELSE 0 END) AS "3xx", + SUM(CASE WHEN status BETWEEN 400 AND 499 THEN 1 ELSE 0 END) AS "4xx", + SUM(CASE WHEN status BETWEEN 500 AND 599 THEN 1 ELSE 0 END) AS "5xx", + COUNT(*) AS total + FROM logs + GROUP BY time_bucket + ORDER BY time_bucket + +# New time-series: cache status over time (compact Hit/Miss; exclude '-' by default) +- name: cache_status_timeseries + label: Cache Status Over Time + icon: archive + chart: stackedBar + bucket: time_bucket + bucket_label: Time + stacked: true + exclude_values: ["-"] + query: | + SELECT {bucket} AS time_bucket, + SUM(CASE WHEN cache_status = 'HIT' THEN 1 ELSE 0 END) AS hit, + SUM(CASE WHEN cache_status = 'MISS' THEN 1 ELSE 0 END) AS miss, + COUNT(*) AS total + FROM logs + GROUP BY time_bucket + ORDER BY time_bucket diff --git a/run-reports.sh b/run-reports.sh index bfe736d..f7cffba 100755 --- a/run-reports.sh +++ b/run-reports.sh @@ -29,21 +29,25 @@ fi # Generate reports for all domains combined echo "[INFO] Generating aggregate reports..." -python scripts/generate_reports.py hourly -python scripts/generate_reports.py daily -python scripts/generate_reports.py weekly -python scripts/generate_reports.py monthly -python scripts/generate_reports.py global +python -m scripts.generate_reports hourly +python -m scripts.generate_reports daily +python -m scripts.generate_reports weekly +python -m scripts.generate_reports monthly +python -m scripts.generate_reports global # Generate reports for each individual domain echo "[INFO] Generating per-domain reports..." -python scripts/generate_reports.py hourly --all-domains -python scripts/generate_reports.py daily --all-domains -python scripts/generate_reports.py weekly --all-domains -python scripts/generate_reports.py monthly --all-domains +python -m scripts.generate_reports hourly --all-domains +python -m scripts.generate_reports daily --all-domains +python -m scripts.generate_reports weekly --all-domains +python -m scripts.generate_reports monthly --all-domains + +# Generate analysis JSON +echo "[INFO] Generating analysis files..." 
+python -m scripts.generate_reports analysis # Generate root index -python scripts/generate_reports.py index +python -m scripts.generate_reports index # Deactivate to keep cron environment clean if type deactivate >/dev/null 2>&1; then diff --git a/scripts/analyze.py b/scripts/analyze.py index 8ac7c30..9f49978 100644 --- a/scripts/analyze.py +++ b/scripts/analyze.py @@ -18,7 +18,7 @@ from __future__ import annotations import sqlite3 from pathlib import Path -from typing import Dict, List, Optional, Set +from typing import List, Optional, Set from datetime import datetime, timedelta import json @@ -105,7 +105,9 @@ def hits(domain: Optional[str] = typer.Option(None, help="Filter by domain")) -> @app.command("cache-ratio") -def cache_ratio_cmd(domain: Optional[str] = typer.Option(None, help="Filter by domain")) -> None: +def cache_ratio_cmd( + domain: Optional[str] = typer.Option(None, help="Filter by domain") +) -> None: """Display cache hit ratio as a percentage.""" ratio = get_cache_ratio(domain) * 100 if domain: @@ -115,7 +117,11 @@ def cache_ratio_cmd(domain: Optional[str] = typer.Option(None, help="Filter by d @app.command("check-missing-domains") -def check_missing_domains(json_output: bool = typer.Option(False, "--json", help="Output missing domains as JSON")) -> None: +def check_missing_domains( + json_output: bool = typer.Option( + False, "--json", help="Output missing domains as JSON" + ) +) -> None: """Show domains present in the database but absent from Nginx config.""" try: from scripts.generate_reports import _get_domains as _db_domains @@ -149,12 +155,9 @@ def check_missing_domains(json_output: bool = typer.Option(False, "--json", help typer.echo(d) -@app.command("suggest-cache") def suggest_cache( - threshold: int = typer.Option( - 10, help="Minimum number of MISS entries to report" - ), - json_output: bool = typer.Option(False, "--json", help="Output results as JSON"), + threshold: int = 10, + json_output: bool = False, ) -> None: """Suggest domain/path pairs that could benefit from caching. @@ -187,7 +190,7 @@ def suggest_cache( HAVING miss_count >= ? 
ORDER BY miss_count DESC """, - (threshold,), + (int(threshold),), ) rows = [r for r in cur.fetchall() if r[0] in no_cache] @@ -207,13 +210,18 @@ def suggest_cache( for item in result: typer.echo(f"{item['host']} {item['path']} {item['misses']}") +@app.command("suggest-cache") +def suggest_cache_cli( + threshold: int = typer.Option(10, help="Minimum number of MISS entries to report"), + json_output: bool = typer.Option(False, "--json", help="Output results as JSON"), +) -> None: + """CLI wrapper for suggest_cache.""" + suggest_cache(threshold=threshold, json_output=json_output) + -@app.command("detect-threats") def detect_threats( - hours: int = typer.Option(1, help="Number of recent hours to analyze"), - ip_threshold: int = typer.Option( - 100, help="Requests from a single IP to flag" - ), + hours: int = 1, + ip_threshold: int = 100, ) -> None: """Detect potential security threats from recent logs.""" @@ -229,8 +237,8 @@ def detect_threats( max_dt = datetime.strptime(row[0], "%Y-%m-%d %H:%M:%S") recent_end = max_dt - recent_start = recent_end - timedelta(hours=hours) - prev_start = recent_start - timedelta(hours=hours) + recent_start = recent_end - timedelta(hours=int(hours)) + prev_start = recent_start - timedelta(hours=int(hours)) prev_end = recent_start fmt = "%Y-%m-%d %H:%M:%S" @@ -316,9 +324,7 @@ def detect_threats( """, (recent_start_s, recent_end_s, ip_threshold), ) - high_ip_requests = [ - {"ip": ip, "requests": cnt} for ip, cnt in cur.fetchall() - ] + high_ip_requests = [{"ip": ip, "requests": cnt} for ip, cnt in cur.fetchall()] conn.close() @@ -339,6 +345,14 @@ def detect_threats( out_path.write_text(json.dumps(report, indent=2)) typer.echo(json.dumps(report)) +@app.command("detect-threats") +def detect_threats_cli( + hours: int = typer.Option(1, help="Number of recent hours to analyze"), + ip_threshold: int = typer.Option(100, help="Requests from a single IP to flag"), +) -> None: + """CLI wrapper for detect_threats.""" + detect_threats(hours=hours, ip_threshold=ip_threshold) + if __name__ == "__main__": app() diff --git a/scripts/generate_reports.py b/scripts/generate_reports.py index 4c0c911..d3c2f8a 100644 --- a/scripts/generate_reports.py +++ b/scripts/generate_reports.py @@ -1,18 +1,27 @@ import json +import sys import sqlite3 from pathlib import Path import shutil from typing import List, Dict, Optional +from datetime import datetime, timezone +import time import yaml import typer from jinja2 import Environment, FileSystemLoader +# Ensure project root is importable when running as a script (python scripts/generate_reports.py) +PROJECT_ROOT = Path(__file__).resolve().parent.parent +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + DB_PATH = Path("database/ngxstat.db") OUTPUT_DIR = Path("output") TEMPLATE_DIR = Path("templates") REPORT_CONFIG = Path("reports.yml") +GENERATED_MARKER = OUTPUT_DIR / "generated.txt" # Mapping of interval names to SQLite strftime formats. 
These strings are # substituted into report queries whenever the special ``{bucket}`` token is @@ -28,6 +37,19 @@ INTERVAL_FORMATS = { app = typer.Typer(help="Generate aggregated log reports") +@app.callback() +def _cli_callback(ctx: typer.Context) -> None: + """Register post-command hook to note generation time.""" + + def _write_marker() -> None: + OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + # Use timezone-aware UTC to avoid deprecation warnings and ambiguity + timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S") + GENERATED_MARKER.write_text(f"{timestamp}\n") + + ctx.call_on_close(_write_marker) + + def _get_domains() -> List[str]: """Return a sorted list of unique domains from the logs table.""" conn = sqlite3.connect(DB_PATH) @@ -56,14 +78,17 @@ def _save_json(path: Path, data: List[Dict]) -> None: def _copy_icons() -> None: - """Copy vendored icons to the output directory.""" + """Copy vendored icons and scripts to the output directory.""" src_dir = Path("static/icons") dst_dir = OUTPUT_DIR / "icons" - if not src_dir.is_dir(): - return - dst_dir.mkdir(parents=True, exist_ok=True) - for icon in src_dir.glob("*.svg"): - shutil.copy(icon, dst_dir / icon.name) + if src_dir.is_dir(): + dst_dir.mkdir(parents=True, exist_ok=True) + for icon in src_dir.glob("*.svg"): + shutil.copy(icon, dst_dir / icon.name) + + js_src = Path("static/chartManager.js") + if js_src.is_file(): + shutil.copy(js_src, OUTPUT_DIR / js_src.name) def _render_snippet(report: Dict, out_dir: Path) -> None: @@ -74,7 +99,9 @@ def _render_snippet(report: Dict, out_dir: Path) -> None: snippet_path.write_text(template.render(report=report)) -def _write_stats() -> None: +def _write_stats( + generated_at: Optional[str] = None, generation_seconds: Optional[float] = None +) -> None: """Query basic dataset stats and write them to ``output/global/stats.json``.""" conn = sqlite3.connect(DB_PATH) cur = conn.cursor() @@ -98,6 +125,10 @@ def _write_stats() -> None: "end_date": end_date, "unique_domains": unique_domains, } + if generated_at: + stats["generated_at"] = generated_at + if generation_seconds is not None: + stats["generation_seconds"] = generation_seconds out_path = OUTPUT_DIR / "global" / "stats.json" _save_json(out_path, stats) @@ -154,6 +185,16 @@ def _generate_interval(interval: str, domain: Optional[str] = None) -> None: name = definition["name"] query = definition["query"].replace("{bucket}", bucket) query = query.replace("FROM logs", "FROM logs_view") + # Apply top_n limit for tables (performance-friendly), if configured + top_n = definition.get("top_n") + chart_type = definition.get("chart", "line") + if top_n and chart_type == "table": + try: + n = int(top_n) + if "LIMIT" not in query.upper(): + query = f"{query}\nLIMIT {n}" + except Exception: + pass cur.execute(query) rows = cur.fetchall() headers = [c[0] for c in cur.description] @@ -169,10 +210,28 @@ def _generate_interval(interval: str, domain: Optional[str] = None) -> None: } if "icon" in definition: entry["icon"] = definition["icon"] + if "bucket" in definition: + entry["bucket"] = definition["bucket"] + if "buckets" in definition: + entry["buckets"] = definition["buckets"] + if "bucket_label" in definition: + entry["bucket_label"] = definition["bucket_label"] if "color" in definition: entry["color"] = definition["color"] if "colors" in definition: entry["colors"] = definition["colors"] + # Optional UX metadata passthrough for frontend-only transforms + for key in ( + "windows_supported", + "window_default", + "group_others_threshold", + 
"exclude_values", + "top_n", + "stacked", + "palette", + ): + if key in definition: + entry[key] = definition[key] _render_snippet(entry, out_dir) report_list.append(entry) @@ -192,12 +251,9 @@ def _generate_all_domains(interval: str) -> None: def _generate_root_index() -> None: """Render the top-level index listing all intervals and domains.""" _copy_icons() - intervals = [ - p.name - for p in OUTPUT_DIR.iterdir() - if p.is_dir() and p.name.lower() not in {"domains", "global", "analysis"} - ] - intervals.sort() + intervals = sorted( + [name for name in INTERVAL_FORMATS if (OUTPUT_DIR / name).is_dir()] + ) domains_dir = OUTPUT_DIR / "domains" domains: List[str] = [] @@ -221,6 +277,10 @@ def _generate_global() -> None: typer.echo("No report definitions found") return + start_time = time.time() + # Use timezone-aware UTC for generated_at (string remains unchanged format) + generated_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S") + _copy_icons() conn = sqlite3.connect(DB_PATH) @@ -236,6 +296,16 @@ def _generate_global() -> None: name = definition["name"] query = definition["query"] + # Apply top_n limit for tables (performance-friendly), if configured + top_n = definition.get("top_n") + chart_type = definition.get("chart", "line") + if top_n and chart_type == "table": + try: + n = int(top_n) + if "LIMIT" not in query.upper(): + query = f"{query}\nLIMIT {n}" + except Exception: + pass cur.execute(query) rows = cur.fetchall() headers = [c[0] for c in cur.description] @@ -251,18 +321,65 @@ def _generate_global() -> None: } if "icon" in definition: entry["icon"] = definition["icon"] + if "bucket" in definition: + entry["bucket"] = definition["bucket"] + if "buckets" in definition: + entry["buckets"] = definition["buckets"] + if "bucket_label" in definition: + entry["bucket_label"] = definition["bucket_label"] if "color" in definition: entry["color"] = definition["color"] if "colors" in definition: entry["colors"] = definition["colors"] + # Optional UX metadata passthrough for frontend-only transforms + for key in ( + "windows_supported", + "window_default", + "group_others_threshold", + "exclude_values", + "top_n", + "stacked", + "palette", + ): + if key in definition: + entry[key] = definition[key] _render_snippet(entry, out_dir) report_list.append(entry) _save_json(out_dir / "reports.json", report_list) - _write_stats() + elapsed = round(time.time() - start_time, 2) + _write_stats(generated_at, elapsed) typer.echo("Generated global reports") +def _generate_analysis() -> None: + """Generate analysis JSON files consumed by the Analysis tab.""" + try: + # Import lazily to avoid circulars and keep dependencies optional + from scripts import analyze + except Exception as exc: # pragma: no cover - defensive + typer.echo(f"Failed to import analysis module: {exc}") + return + + # Ensure output root and icons present for parity + _copy_icons() + + # These commands write JSON files under output/analysis/ + try: + analyze.check_missing_domains(json_output=True) + except Exception as exc: # pragma: no cover - continue best-effort + typer.echo(f"check_missing_domains failed: {exc}") + try: + analyze.suggest_cache(json_output=True) + except Exception as exc: # pragma: no cover + typer.echo(f"suggest_cache failed: {exc}") + try: + analyze.detect_threats() + except Exception as exc: # pragma: no cover + typer.echo(f"detect_threats failed: {exc}") + typer.echo("Generated analysis JSON files") + + @app.command() def hourly( domain: Optional[str] = typer.Option( @@ -333,6 +450,12 @@ def 
global_reports() -> None: _generate_global() +@app.command() +def analysis() -> None: + """Generate analysis JSON files for the Analysis tab.""" + _generate_analysis() + + @app.command() def index() -> None: """Generate the root index page linking all reports.""" diff --git a/scripts/init_db.py b/scripts/init_db.py index f378b5c..b9ea07d 100644 --- a/scripts/init_db.py +++ b/scripts/init_db.py @@ -61,7 +61,9 @@ try: suffix = match.group(1) number = int(suffix.lstrip(".")) if suffix else 0 log_files.append((number, os.path.join(LOG_DIR, f))) - log_files = [path for _, path in sorted(log_files, key=lambda x: x[0], reverse=True)] + log_files = [ + path for _, path in sorted(log_files, key=lambda x: x[0], reverse=True) + ] except FileNotFoundError: print(f"[ERROR] Log directory not found: {LOG_DIR}") exit(1) diff --git a/scripts/nginx_config.py b/scripts/nginx_config.py index dbd635d..bc585a7 100644 --- a/scripts/nginx_config.py +++ b/scripts/nginx_config.py @@ -93,4 +93,3 @@ def parse_servers(paths: Set[Path]) -> List[Dict[str, str]]: entry["root"] = " ".join(directives["root"]) servers.append(entry) return servers - diff --git a/static/chartManager.js b/static/chartManager.js new file mode 100644 index 0000000..2f14f4f --- /dev/null +++ b/static/chartManager.js @@ -0,0 +1,109 @@ +export let currentLoad = null; +const loadInfo = new Map(); + +export function newLoad(container) { + if (currentLoad) { + abortLoad(currentLoad); + } + reset(container); + const controller = new AbortController(); + const token = { controller, charts: new Map() }; + loadInfo.set(token, token); + currentLoad = token; + return token; +} + +export function abortLoad(token) { + const info = loadInfo.get(token); + if (!info) return; + info.controller.abort(); + info.charts.forEach(chart => { + try { + chart.destroy(); + } catch (e) {} + }); + loadInfo.delete(token); + if (currentLoad === token) { + currentLoad = null; + } +} + +export function registerChart(token, id, chart) { + const info = loadInfo.get(token); + if (info) { + info.charts.set(id, chart); + } else { + chart.destroy(); + } +} + +export function reset(container) { + if (!container) return; + container.querySelectorAll('canvas').forEach(c => { + const chart = Chart.getChart(c); + if (chart) { + chart.destroy(); + } + }); + container.innerHTML = ''; +} + +// ---- Lightweight client-side data helpers ---- + +// Slice last N rows from a time-ordered array +export function sliceWindow(data, n) { + if (!Array.isArray(data) || n === undefined || n === null) return data; + if (n === 'all') return data; + const count = Number(n); + if (!Number.isFinite(count) || count <= 0) return data; + return data.slice(-count); +} + +// Exclude rows whose value in key is in excluded list +export function excludeValues(data, key, excluded = []) { + if (!excluded || excluded.length === 0) return data; + const set = new Set(excluded); + return data.filter(row => !set.has(row[key])); +} + +// Compute percentages for categorical distributions (valueKey default 'value') +export function toPercent(data, valueKey = 'value') { + const total = data.reduce((s, r) => s + (Number(r[valueKey]) || 0), 0); + if (total <= 0) return data.map(r => ({ ...r })); + return data.map(r => ({ ...r, [valueKey]: (Number(r[valueKey]) || 0) * 100 / total })); +} + +// Group categories with share < threshold into an 'Other' bucket. 
+export function groupOthers(data, bucketKey, valueKey = 'value', threshold = 0.03, otherLabel = 'Other') { + if (!Array.isArray(data) || data.length === 0) return data; + const total = data.reduce((s, r) => s + (Number(r[valueKey]) || 0), 0); + if (total <= 0) return data; + const major = []; + let other = 0; + for (const r of data) { + const v = Number(r[valueKey]) || 0; + if (total && v / total < threshold) { + other += v; + } else { + major.push({ ...r }); + } + } + if (other > 0) major.push({ [bucketKey]: otherLabel, [valueKey]: other }); + return major; +} + +// Simple moving average over numeric array +export function movingAverage(series, span = 3) { + const n = Math.max(1, Number(span) || 1); + const out = []; + for (let i = 0; i < series.length; i++) { + const start = Math.max(0, i - n + 1); + let sum = 0, cnt = 0; + for (let j = start; j <= i; j++) { + const v = Number(series[j]); + if (Number.isFinite(v)) { sum += v; cnt++; } + } + out.push(cnt ? sum / cnt : null); + } + return out; +} diff --git a/templates/index.html b/templates/index.html index 237f563..a5de3db 100644 --- a/templates/index.html +++ b/templates/index.html @@ -12,14 +12,15 @@ -
[templates/index.html hunk: the HTML markup was lost in extraction and cannot be reconstructed here. The surviving text shows the dashboard's "Overview" navigation gaining a "Recent" entry, and the stats panel ("Total logs: -", "Date range: - to -", "Unique domains: -") gaining "Last generated: -" and "Generation time: - seconds" fields, matching the metadata now written by scripts/generate_reports.py.]
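The "Last generated" and "Generation time" fields referenced in the template hunk above are populated from the metadata that `_generate_global` now records through `_write_stats`, plus the `generated.txt` marker written by the CLI callback. A minimal sketch of checking that output after a run, assuming the `output/global/stats.json` and `output/generated.txt` paths used in `scripts/generate_reports.py`:

```python
import json
from pathlib import Path

output_dir = Path("output")

# stats.json is written by _write_stats(); generated_at and generation_seconds
# are the fields added in this patch.
stats = json.loads((output_dir / "global" / "stats.json").read_text())
print("Last generated:", stats.get("generated_at", "-"))
print("Generation time:", stats.get("generation_seconds", "-"), "seconds")
print("Unique domains:", stats.get("unique_domains", "-"))

# generated.txt is the marker written after every generate_reports command.
marker = output_dir / "generated.txt"
if marker.is_file():
    print("Marker timestamp:", marker.read_text().strip())
```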