From 606e612ab0b5d79ee166c59b29383b5305af8fd3 Mon Sep 17 00:00:00 2001 From: Jordan Wages Date: Tue, 24 Jun 2025 14:05:03 -0500 Subject: [PATCH] Refactor getPlainText into shared module --- modules/ExpressionSearchFilter.jsm | 85 +--------------------------- modules/messageUtils.jsm | 89 ++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+), 84 deletions(-) create mode 100644 modules/messageUtils.jsm diff --git a/modules/ExpressionSearchFilter.jsm b/modules/ExpressionSearchFilter.jsm index 2403b01..c50dd59 100644 --- a/modules/ExpressionSearchFilter.jsm +++ b/modules/ExpressionSearchFilter.jsm @@ -2,10 +2,9 @@ var { ExtensionParent } = ChromeUtils.importESModule("resource://gre/modules/ExtensionParent.sys.mjs"); var { MailServices } = ChromeUtils.importESModule("resource:///modules/MailServices.sys.mjs"); var { Services } = globalThis || ChromeUtils.importESModule("resource://gre/modules/Services.sys.mjs"); -var { NetUtil } = ChromeUtils.importESModule("resource://gre/modules/NetUtil.sys.mjs"); -var { MimeParser } = ChromeUtils.importESModule("resource:///modules/mimeParser.sys.mjs"); var { aiLog } = ChromeUtils.import("resource://aifilter/modules/logger.jsm"); var { AiClassifier } = ChromeUtils.import("resource://aifilter/modules/AiClassifier.jsm"); +var { getPlainText } = ChromeUtils.import("resource://aifilter/modules/messageUtils.jsm"); function sha256Hex(str) { const hasher = Cc["@mozilla.org/security/hash;1"].createInstance(Ci.nsICryptoHash); @@ -58,88 +57,6 @@ class CustomerTermBase { } } -function getPlainText(msgHdr) { - aiLog(`[ExpressionSearchFilter] Extracting plain text for message ID ${msgHdr.messageId}`, {debug: true}); - let folder = msgHdr.folder; - if (!folder.getMsgInputStream) return ""; - let reusable = {}; - let stream = folder.getMsgInputStream(msgHdr, reusable); - let data = NetUtil.readInputStreamToString(stream, msgHdr.messageSize); - if (!reusable.value) stream.close(); - - let parser = Cc["@mozilla.org/parserutils;1"].getService(Ci.nsIParserUtils); - - try { - let root = MimeParser.parseSync(data, {strformat: "unicode"}); - let parts = []; - - function pushPlaceholder(type, info, bytes) { - bytes = bytes || 0; - let prettyType = type.split("/")[1] || type; - parts.push(`[${info}: ${prettyType}, ${bytes} bytes]`); - } - - function byteSizeFromBase64(str) { - let clean = str.replace(/[^A-Za-z0-9+/=]/g, ""); - return Math.floor(clean.length * 3 / 4); - } - - function replaceInlineBase64(text) { - return text.replace(/[A-Za-z0-9+/]{100,}={0,2}/g, - m => `[base64: ${byteSizeFromBase64(m)} bytes]`); - } - - function walk(node) { - if (node.parts && node.parts.length) { - for (let child of node.parts) { - walk(child); - } - return; - } - - let ct = (node.contentType || "text/plain").toLowerCase(); - let cd = (node.headers?.["content-disposition"]?.[0] || "").toLowerCase(); - let enc = (node.headers?.["content-transfer-encoding"]?.[0] || "").toLowerCase(); - let bodyText = String(node.body || ""); - - if (cd.includes("attachment")) { - pushPlaceholder(ct, "binary attachment", byteSizeFromBase64(bodyText)); - } else if (ct.startsWith("text/plain")) { - if (enc === "base64") { - parts.push(`[base64: ${byteSizeFromBase64(bodyText)} bytes]`); - } else { - parts.push(replaceInlineBase64(bodyText)); - } - } else if (ct.startsWith("text/html")) { - if (enc === "base64") { - parts.push(`[base64: ${byteSizeFromBase64(bodyText)} bytes]`); - } else { - let txt = parser.convertToPlainText(bodyText, - Ci.nsIDocumentEncoder.OutputLFLineBreak | - Ci.nsIDocumentEncoder.OutputNoScriptContent | - Ci.nsIDocumentEncoder.OutputNoFramesContent | - Ci.nsIDocumentEncoder.OutputBodyOnly, 0); - parts.push(replaceInlineBase64(txt)); - } - } else { - // Other single part types treated as attachments - pushPlaceholder(ct, "binary attachment", byteSizeFromBase64(bodyText)); - } - } - - walk(root); - return parts.join("\n"); - } catch (e) { - // Fallback: convert entire raw message to text - aiLog(`Failed to parse MIME, falling back to raw conversion`, {level: 'warn'}, e); - return parser.convertToPlainText(data, - Ci.nsIDocumentEncoder.OutputLFLineBreak | - Ci.nsIDocumentEncoder.OutputNoScriptContent | - Ci.nsIDocumentEncoder.OutputNoFramesContent | - Ci.nsIDocumentEncoder.OutputBodyOnly, 0); - } -} - class ClassificationTerm extends CustomerTermBase { constructor() { diff --git a/modules/messageUtils.jsm b/modules/messageUtils.jsm new file mode 100644 index 0000000..a4978a7 --- /dev/null +++ b/modules/messageUtils.jsm @@ -0,0 +1,89 @@ +"use strict"; +var { NetUtil } = ChromeUtils.importESModule("resource://gre/modules/NetUtil.sys.mjs"); +var { MimeParser } = ChromeUtils.importESModule("resource:///modules/mimeParser.sys.mjs"); +var { aiLog } = ChromeUtils.import("resource://aifilter/modules/logger.jsm"); + +var EXPORTED_SYMBOLS = ["getPlainText"]; + +function getPlainText(msgHdr) { + aiLog(`[ExpressionSearchFilter] Extracting plain text for message ID ${msgHdr.messageId}`, {debug: true}); + let folder = msgHdr.folder; + if (!folder.getMsgInputStream) return ""; + let reusable = {}; + let stream = folder.getMsgInputStream(msgHdr, reusable); + let data = NetUtil.readInputStreamToString(stream, msgHdr.messageSize); + if (!reusable.value) stream.close(); + + let parser = Cc["@mozilla.org/parserutils;1"].getService(Ci.nsIParserUtils); + + try { + let root = MimeParser.parseSync(data, {strformat: "unicode"}); + let parts = []; + + function pushPlaceholder(type, info, bytes) { + bytes = bytes || 0; + let prettyType = type.split("/")[1] || type; + parts.push(`[${info}: ${prettyType}, ${bytes} bytes]`); + } + + function byteSizeFromBase64(str) { + let clean = str.replace(/[^A-Za-z0-9+/=]/g, ""); + return Math.floor(clean.length * 3 / 4); + } + + function replaceInlineBase64(text) { + return text.replace(/[A-Za-z0-9+/]{100,}={0,2}/g, + m => `[base64: ${byteSizeFromBase64(m)} bytes]`); + } + + function walk(node) { + if (node.parts && node.parts.length) { + for (let child of node.parts) { + walk(child); + } + return; + } + + let ct = (node.contentType || "text/plain").toLowerCase(); + let cd = (node.headers?.["content-disposition"]?.[0] || "").toLowerCase(); + let enc = (node.headers?.["content-transfer-encoding"]?.[0] || "").toLowerCase(); + let bodyText = String(node.body || ""); + + if (cd.includes("attachment")) { + pushPlaceholder(ct, "binary attachment", byteSizeFromBase64(bodyText)); + } else if (ct.startsWith("text/plain")) { + if (enc === "base64") { + parts.push(`[base64: ${byteSizeFromBase64(bodyText)} bytes]`); + } else { + parts.push(replaceInlineBase64(bodyText)); + } + } else if (ct.startsWith("text/html")) { + if (enc === "base64") { + parts.push(`[base64: ${byteSizeFromBase64(bodyText)} bytes]`); + } else { + let txt = parser.convertToPlainText(bodyText, + Ci.nsIDocumentEncoder.OutputLFLineBreak | + Ci.nsIDocumentEncoder.OutputNoScriptContent | + Ci.nsIDocumentEncoder.OutputNoFramesContent | + Ci.nsIDocumentEncoder.OutputBodyOnly, 0); + parts.push(replaceInlineBase64(txt)); + } + } else { + // Other single part types treated as attachments + pushPlaceholder(ct, "binary attachment", byteSizeFromBase64(bodyText)); + } + } + + walk(root); + return parts.join("\n"); + } catch (e) { + // Fallback: convert entire raw message to text + aiLog(`Failed to parse MIME, falling back to raw conversion`, {level: 'warn'}, e); + return parser.convertToPlainText(data, + Ci.nsIDocumentEncoder.OutputLFLineBreak | + Ci.nsIDocumentEncoder.OutputNoScriptContent | + Ci.nsIDocumentEncoder.OutputNoFramesContent | + Ci.nsIDocumentEncoder.OutputBodyOnly, 0); + } +} +