Parse full message for AI classification

2025-06-26 17:02:10 -05:00 · 2025-06-26 17:02:10 -05:00 · 1070610174
commit 1070610174
parent 93a36ea2d8
2 changed files with 57 additions and 4 deletions
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -43,3 +43,18 @@ Additional documentation exists outside this repository.
  - [Bulma.css](https://github.com/jgthms/bulma)
 - Issue tracker: [Thunderbird tracker on Bugzilla](https://bugzilla.mozilla.org/describecomponents.cgi?product=Thunderbird)

+
+### Message Structure Notes
+
+Messages retrieved with `messenger.messages.getFull` are returned as
+nested objects. The root contains `headers` and a `parts` array. Each part may
+itself contain `parts` for multipart messages or a `body` string. Attachments are
+indicated via the `content-disposition` header.
+
+When constructing the text sent to the AI service, parse the full message
+recursively. Include key headers such as `from`, `to`, `subject`, and others, and
+record attachment summaries rather than raw binary data. Inline or attached
+base64 data should be replaced with placeholders showing the byte size. The
+final string should have the headers, a brief attachment section, then the plain
+text extracted from all text parts.
+
--- a/background.js
+++ b/background.js
@@ -39,6 +39,45 @@ async function sha256Hex(str) {
    return Array.from(new Uint8Array(buf), b => b.toString(16).padStart(2, '0')).join('');
 }

+function byteSize(str) { // Returns the number of bytes in the UTF-8 encoding of str.
+    return new TextEncoder().encode(str || "").length; // any falsy input (null, undefined, empty string) is measured as the empty string, giving 0
+}
+
+function replaceInlineBase64(text) { // Replaces every run of 100 or more base64-alphabet characters (plus up to two = pads) in text with a size placeholder; text must be a string because replace is called on it directly.
+    return text.replace(/[A-Za-z0-9+/]{100,}={0,2}/g,
+        m => `[base64: ${byteSize(m)} bytes]`); // the size shown is that of the matched, still-encoded run, not of the decoded data
+}
+
+function collectText(part, bodyParts, attachments) { // Walks a message part tree depth-first, pushing text onto bodyParts and one summary string per attachment or non-text leaf onto attachments; returns nothing.
+    if (part.parts && part.parts.length) { // container part: recurse into the children and stop; the body of the container itself is never read
+        for (const p of part.parts) collectText(p, bodyParts, attachments);
+        return;
+    }
+    const ct = (part.contentType || "text/plain").toLowerCase(); // a leaf without a contentType is treated as text/plain
+    const cd = (part.headers?.["content-disposition"]?.[0] || "").toLowerCase(); // only the first content-disposition value is inspected; missing headers yield the empty string
+    const body = String(part.body || "");
+    if (cd.includes("attachment") || !ct.startsWith("text/")) { // summarise instead of inlining: explicit attachments and every non-text leaf
+        const nameMatch = /filename\s*=\s*"?([^";]+)/i.exec(cd) || /name\s*=\s*"?([^";]+)/i.exec(part.headers?.["content-type"]?.[0] || "");
+        const name = nameMatch ? nameMatch[1] : ""; // NOTE(review): cd was lowercased above, so a name captured from it has lost its original case; the content-type fallback keeps case
+        attachments.push(`${name} (${ct}, ${part.size || byteSize(body)} bytes)`); // uses part.size when truthy, otherwise the encoded length of the body string
+    } else if (ct.startsWith("text/html")) {
+        const doc = new DOMParser().parseFromString(body, 'text/html'); // parse the markup so that only its text content is kept
+        bodyParts.push(replaceInlineBase64(doc.body.textContent || ""));
+    } else {
+        bodyParts.push(replaceInlineBase64(body)); // every other text/* leaf is kept verbatim apart from long base64 runs
+    }
+}
+
+function buildEmailText(full) { // Flattens a full message object into one trimmed string: header lines, an attachment summary, a blank line, then the collected body text.
+    const bodyParts = [];
+    const attachments = [];
+    collectText(full, bodyParts, attachments); // the root object is walked like any other part, so full must not be null or undefined
+    const headers = Object.entries(full.headers || {}) // every entry of full.headers is emitted, not a selected subset
+        .map(([k,v]) => `${k}: ${v.join(' ')}`) // assumes each header value is an array of strings (TODO confirm against the getFull docs); a non-array value would throw here
+        .join('\n');
+    const attachInfo = `Attachments: ${attachments.length}` + (attachments.length ? "\n" + attachments.map(a => ` - ${a}`).join('\n') : ""); // the count line is always present; the bulleted list follows only when there is at least one attachment
+    return `${headers}\n${attachInfo}\n\n${bodyParts.join('\n')}`.trim(); // trim drops the leading newline left by an empty header block and any trailing blank lines
+}
 async function applyAiRules(idsInput) {
    const ids = Array.isArray(idsInput) ? idsInput : [idsInput];
    if (!ids.length) return queue;
@@ -66,8 +105,7 @@ async function applyAiRules(idsInput) {
            updateActionIcon();
            try {
                const full = await messenger.messages.getFull(id);
-                const text = full?.parts?.[0]?.body || "";
-                for (const rule of aiRules) {
+                const text = buildEmailText(full);
                const cacheKey = await sha256Hex(`${id}|${rule.criterion}`);
                const matched = await AiClassifier.classifyText(text, rule.criterion, cacheKey);
                    if (matched) {