wagenet-ip-ban-list/scripts/banlist_metrics.py
codex-bot 2cc881199f
All checks were successful
Generate banlist history graph / build (push) Successful in 6s
viz: format x-axis ticks with full date (YYYY-MM-DD)
2025-08-26 23:16:34 -05:00

169 lines
4.9 KiB
Python

#!/usr/bin/env python3
"""
Generate historical metrics for banned.txt by walking git history.

Outputs:
  - CSV with (date_iso, timestamp, commit, count)
  - Optional SVG/PNG line chart of counts over time

Usage:
  python scripts/banlist_metrics.py \
    --file banned.txt \
    --csv metrics/banlist_counts.csv \
    --image assets/banlist_history.svg
"""
from __future__ import annotations
import argparse
import csv
import os
import subprocess
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import List, Optional
@dataclass
class Snapshot:
    """Size of the ban list as recorded at a single git commit."""

    commit: str  # commit identifier the count was taken from
    timestamp: int  # Unix epoch seconds
    count: int  # number of entries counted in the file at that commit

    @property
    def date_iso(self) -> str:
        """Return ``timestamp`` rendered as an ISO 8601 string in UTC."""
        moment = datetime.fromtimestamp(self.timestamp, tz=timezone.utc)
        return moment.isoformat()
def run(cmd: List[str], cwd: Optional[str] = None) -> str:
    """Run a command and return its standard output as stripped text.

    Args:
        cmd: Program and arguments, executed without a shell.
        cwd: Directory to run the command in; ``None`` keeps the current one.

    Returns:
        The command's stdout decoded as UTF-8, with leading and trailing
        whitespace removed. Bytes that are not valid UTF-8 are replaced with
        U+FFFD instead of raising.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    out = subprocess.check_output(cmd, cwd=cwd)
    # Old revisions of a tracked file are not guaranteed to be valid UTF-8;
    # a strict decode would abort the whole run on a single bad byte.
    return out.decode("utf-8", errors="replace").strip()
def git_file_commits(path: str) -> List[str]:
    """Return the commits reachable from HEAD that touched *path*.

    The result is ordered oldest first (``git rev-list --reverse``) and
    contains one commit identifier per element, with blank lines dropped.
    """
    listing = run(["git", "rev-list", "--reverse", "HEAD", "--", path])
    return list(filter(None, listing.splitlines()))
def git_commit_timestamp(commit: str) -> int:
    """Return the committer date of *commit* as Unix epoch seconds."""
    # %ct is git's format placeholder for the committer date as a Unix
    # timestamp; -s suppresses the diff so only that number is printed.
    raw_epoch = run(["git", "show", "-s", "--format=%ct", commit])
    return int(raw_epoch)
def git_show_file_at(commit: str, path: str) -> str:
    """Return the contents of *path* as stored in *commit*.

    The text comes back through ``run``, so surrounding whitespace is
    stripped. ``run`` uses ``subprocess.check_output``, so a failing
    ``git show`` raises ``subprocess.CalledProcessError``.
    """
    object_spec = f"{commit}:{path}"
    return run(["git", "show", object_spec])
def count_ips(text: str) -> int:
    """Count the entries in ban-list *text*.

    Each line is trimmed of surrounding whitespace; every line that is then
    neither empty nor starts with ``#`` counts as one entry. Entries are not
    validated or de-duplicated.
    """
    trimmed = (raw.strip() for raw in text.splitlines())
    return sum(1 for entry in trimmed if entry and not entry.startswith("#"))
def collect_snapshots(target_file: str) -> List[Snapshot]:
    """Build one Snapshot per commit in which *target_file* can be read.

    Commits are visited in the order returned by ``git_file_commits``
    (oldest first). A commit whose ``git show`` fails is skipped.
    """
    history: List[Snapshot] = []
    for sha in git_file_commits(target_file):
        try:
            body = git_show_file_at(sha, target_file)
        except subprocess.CalledProcessError:
            # File may not exist in this commit (renames, etc.)
            continue
        entries = count_ips(body)
        committed_at = git_commit_timestamp(sha)
        history.append(Snapshot(commit=sha, timestamp=committed_at, count=entries))
    return history
def write_csv(snaps: List[Snapshot], csv_path: str) -> None:
    """Write the snapshots to *csv_path* as CSV, overwriting any existing file.

    The file starts with the header ``date_iso,timestamp,commit,count``
    followed by one row per snapshot, in the order given.

    Args:
        snaps: Snapshots to serialize; an empty list yields a header-only file.
        csv_path: Destination path. Its parent directory is created if needed;
            a bare filename is written to the current directory.
    """
    parent = os.path.dirname(csv_path)
    # A bare filename has an empty directory part, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    # newline="" lets the csv module control line endings itself.
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["date_iso", "timestamp", "commit", "count"])  # header
        w.writerows([s.date_iso, s.timestamp, s.commit, s.count] for s in snaps)
def write_chart(snaps: List[Snapshot], image_path: str) -> None:
    """Plot entry count against commit time and save the chart to *image_path*.

    Chart generation is best-effort: if matplotlib cannot be imported, or
    there are no snapshots, a message is printed and nothing is written.

    Args:
        snaps: Snapshots to plot, in the order they should be drawn.
        image_path: Destination image path. Its parent directory is created
            if needed; a bare filename is written to the current directory.
    """
    try:
        import matplotlib

        matplotlib.use("Agg")  # headless
        import matplotlib.pyplot as plt
        import matplotlib.dates as mdates
    except Exception as e:
        print(f"Skipping chart generation (matplotlib unavailable): {e}")
        return
    if not snaps:
        print("No snapshots to chart.")
        return

    xs = [datetime.fromtimestamp(s.timestamp, tz=timezone.utc) for s in snaps]
    ys = [s.count for s in snaps]
    fig, ax = plt.subplots(figsize=(9.5, 3.2))
    ax.plot(xs, ys, marker="o", linewidth=1.5, markersize=2)
    ax.set_title("WageNet IP Ban List Size Over Time")
    ax.set_xlabel("Date (UTC)")
    ax.set_ylabel("IP count")
    ax.grid(True, linestyle=":", linewidth=0.5)

    # Use full date labels on major ticks
    try:
        locator = mdates.AutoDateLocator(minticks=4, maxticks=8)
        formatter = mdates.DateFormatter('%Y-%m-%d')
        ax.xaxis.set_major_locator(locator)
        ax.xaxis.set_major_formatter(formatter)
    except Exception as e:
        # Tick formatting is cosmetic: keep the default labels, but say so
        # instead of silently hiding the failure.
        print(f"Using default x-axis tick labels: {e}")

    # Always rotate for readability regardless of formatter availability
    try:
        fig.autofmt_xdate(rotation=45, ha="right")
    except Exception:
        for label in ax.get_xticklabels():
            label.set_rotation(45)
            label.set_horizontalalignment("right")

    # Add slight horizontal margins to prevent clipping at edges
    ax.margins(x=0.02)
    fig.tight_layout()

    parent = os.path.dirname(image_path)
    # A bare filename has an empty directory part, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    fig.savefig(image_path)
    plt.close(fig)
def main() -> int:
    """Parse the command line, then write the CSV and, optionally, the chart.

    Options: ``--file`` (ban list path in the repo), ``--csv`` (output CSV
    path) and ``--image`` (output image path; an empty value skips the chart).

    Returns:
        0 once the outputs have been written.

    Raises:
        SystemExit: If ``git rev-parse`` reports that the current directory
            is not inside a git work tree.
    """
    parser = argparse.ArgumentParser()
    option_table = (
        ("--file", "banned.txt", "Path to ban list in repo"),
        ("--csv", "metrics/banlist_counts.csv", "Output CSV path"),
        ("--image", "assets/banlist_history.svg", "Output image path (SVG/PNG)"),
    )
    for flag, fallback, description in option_table:
        parser.add_argument(flag, default=fallback, help=description)
    opts = parser.parse_args()

    # Ensure we are in a git repo
    try:
        run(["git", "rev-parse", "--is-inside-work-tree"])
    except subprocess.CalledProcessError:
        raise SystemExit("This script must run inside a git repository.")

    wants_chart = bool(opts.image)
    history = collect_snapshots(opts.file)
    write_csv(history, opts.csv)
    if wants_chart:
        write_chart(history, opts.image)

    print(f"Wrote {len(history)} snapshots to {opts.csv}")
    if wants_chart:
        print(f"Updated chart at {opts.image}")
    return 0
# Run only when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)