Merge pull request #62 from wagesj45/codex/implement-email-message-optimizations

Implement HTML sanitization options
This commit is contained in:
Jordan Wages 2025-07-05 03:47:48 -05:00 committed by GitHub
commit a358bc5703
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 91 additions and 9 deletions

View file

@ -14,6 +14,9 @@
"template.mistral": { "message": "Mistral" }, "template.mistral": { "message": "Mistral" },
"template.custom": { "message": "Custom" }, "template.custom": { "message": "Custom" },
"options.save": { "message": "Save" }, "options.save": { "message": "Save" },
"options.debugLogging": { "message": "Enable debug logging" } "options.debugLogging": { "message": "Enable debug logging" },
,"options.htmlToMarkdown": { "message": "Convert HTML body to Markdown" } "options.htmlToMarkdown": { "message": "Convert HTML body to Markdown" },
"options.stripUrlParams": { "message": "Remove URL tracking parameters" },
"options.altTextImages": { "message": "Replace images with alt text" },
"options.collapseWhitespace": { "message": "Collapse long whitespace" }
} }

View file

@ -23,6 +23,9 @@ let timingStats = { count: 0, mean: 0, m2: 0, total: 0, last: -1 };
let currentStart = 0; let currentStart = 0;
let logGetTiming = true; let logGetTiming = true;
let htmlToMarkdown = false; let htmlToMarkdown = false;
let stripUrlParams = false;
let altTextImages = false;
let collapseWhitespace = false;
let TurndownService = null; let TurndownService = null;
function setIcon(path) { function setIcon(path) {
@ -58,6 +61,20 @@ function replaceInlineBase64(text) {
m => `[base64: ${byteSize(m)} bytes]`); m => `[base64: ${byteSize(m)} bytes]`);
} }
/**
 * Apply the enabled plain-text clean-ups to a piece of message text.
 *
 * Behaviour is driven by two module-level flags:
 *  - `stripUrlParams`: every http(s) URL is cut at its first `?`, which drops
 *    the query string and anything that follows it inside the URL.
 *  - `collapseWhitespace`: runs of two or more spaces / tabs / NBSPs become a
 *    single space, and runs of three or more newlines become one blank line.
 *
 * @param {*} text - Value to clean; it is coerced with `String()`.
 * @returns {string} The cleaned text (unchanged when both flags are off).
 */
function sanitizeString(text) {
  let t = String(text);
  if (stripUrlParams) {
    // `<`, `>` and `"` never occur unescaped inside a URL but frequently
    // delimit one (e.g. `<https://host/path?id=1>`). Ending the match at them
    // keeps the closing delimiter from being thrown away with the query.
    t = t.replace(/https?:\/\/[^\s)<>"]+/g, m => {
      const idx = m.indexOf('?');
      return idx >= 0 ? m.slice(0, idx) : m;
    });
  }
  if (collapseWhitespace) {
    t = t.replace(/[ \t\u00A0]{2,}/g, ' ').replace(/\n{3,}/g, '\n\n');
  }
  return t;
}
function collectText(part, bodyParts, attachments) { function collectText(part, bodyParts, attachments) {
if (part.parts && part.parts.length) { if (part.parts && part.parts.length) {
for (const p of part.parts) collectText(p, bodyParts, attachments); for (const p of part.parts) collectText(p, bodyParts, attachments);
@ -72,19 +89,35 @@ function collectText(part, bodyParts, attachments) {
attachments.push(`${name} (${ct}, ${part.size || byteSize(body)} bytes)`); attachments.push(`${name} (${ct}, ${part.size || byteSize(body)} bytes)`);
} else if (ct.startsWith("text/html")) { } else if (ct.startsWith("text/html")) {
const doc = new DOMParser().parseFromString(body, 'text/html'); const doc = new DOMParser().parseFromString(body, 'text/html');
if (altTextImages) {
doc.querySelectorAll('img').forEach(img => {
const alt = img.getAttribute('alt') || '';
img.replaceWith(doc.createTextNode(alt));
});
}
if (stripUrlParams) {
doc.querySelectorAll('[href]').forEach(a => {
const href = a.getAttribute('href');
if (href) a.setAttribute('href', href.split('?')[0]);
});
doc.querySelectorAll('[src]').forEach(e => {
const src = e.getAttribute('src');
if (src) e.setAttribute('src', src.split('?')[0]);
});
}
if (htmlToMarkdown && TurndownService) { if (htmlToMarkdown && TurndownService) {
try { try {
const td = new TurndownService(); const td = new TurndownService();
const md = td.turndown(doc.body.innerHTML || body); const md = sanitizeString(td.turndown(doc.body.innerHTML || body));
bodyParts.push(replaceInlineBase64(`[HTML Body converted to Markdown]\n${md}`)); bodyParts.push(replaceInlineBase64(`[HTML Body converted to Markdown]\n${md}`));
} catch (e) { } catch (e) {
bodyParts.push(replaceInlineBase64(doc.body.textContent || "")); bodyParts.push(replaceInlineBase64(sanitizeString(doc.body.textContent || "")));
} }
} else { } else {
bodyParts.push(replaceInlineBase64(doc.body.textContent || "")); bodyParts.push(replaceInlineBase64(sanitizeString(doc.body.textContent || "")));
} }
} else { } else {
bodyParts.push(replaceInlineBase64(body)); bodyParts.push(replaceInlineBase64(sanitizeString(body)));
} }
} }
@ -96,7 +129,8 @@ function buildEmailText(full) {
.map(([k,v]) => `${k}: ${v.join(' ')}`) .map(([k,v]) => `${k}: ${v.join(' ')}`)
.join('\n'); .join('\n');
const attachInfo = `Attachments: ${attachments.length}` + (attachments.length ? "\n" + attachments.map(a => ` - ${a}`).join('\n') : ""); const attachInfo = `Attachments: ${attachments.length}` + (attachments.length ? "\n" + attachments.map(a => ` - ${a}`).join('\n') : "");
return `${headers}\n${attachInfo}\n\n${bodyParts.join('\n')}`.trim(); const combined = `${headers}\n${attachInfo}\n\n${bodyParts.join('\n')}`.trim();
return sanitizeString(combined);
} }
async function applyAiRules(idsInput) { async function applyAiRules(idsInput) {
const ids = Array.isArray(idsInput) ? idsInput : [idsInput]; const ids = Array.isArray(idsInput) ? idsInput : [idsInput];
@ -233,11 +267,14 @@ async function clearCacheForMessages(idsInput) {
} }
try { try {
const store = await storage.local.get(["endpoint", "templateName", "customTemplate", "customSystemPrompt", "aiParams", "debugLogging", "htmlToMarkdown", "aiRules"]); const store = await storage.local.get(["endpoint", "templateName", "customTemplate", "customSystemPrompt", "aiParams", "debugLogging", "htmlToMarkdown", "stripUrlParams", "altTextImages", "collapseWhitespace", "aiRules"]);
logger.setDebug(store.debugLogging); logger.setDebug(store.debugLogging);
await AiClassifier.setConfig(store); await AiClassifier.setConfig(store);
await AiClassifier.init(); await AiClassifier.init();
htmlToMarkdown = store.htmlToMarkdown === true; htmlToMarkdown = store.htmlToMarkdown === true;
stripUrlParams = store.stripUrlParams === true;
altTextImages = store.altTextImages === true;
collapseWhitespace = store.collapseWhitespace === true;
const savedStats = await storage.local.get('classifyStats'); const savedStats = await storage.local.get('classifyStats');
if (savedStats.classifyStats && typeof savedStats.classifyStats === 'object') { if (savedStats.classifyStats && typeof savedStats.classifyStats === 'object') {
Object.assign(timingStats, savedStats.classifyStats); Object.assign(timingStats, savedStats.classifyStats);
@ -273,6 +310,18 @@ async function clearCacheForMessages(idsInput) {
htmlToMarkdown = changes.htmlToMarkdown.newValue === true; htmlToMarkdown = changes.htmlToMarkdown.newValue === true;
logger.aiLog("htmlToMarkdown updated from storage change", {debug: true}, htmlToMarkdown); logger.aiLog("htmlToMarkdown updated from storage change", {debug: true}, htmlToMarkdown);
} }
if (changes.stripUrlParams) {
stripUrlParams = changes.stripUrlParams.newValue === true;
logger.aiLog("stripUrlParams updated from storage change", {debug: true}, stripUrlParams);
}
if (changes.altTextImages) {
altTextImages = changes.altTextImages.newValue === true;
logger.aiLog("altTextImages updated from storage change", {debug: true}, altTextImages);
}
if (changes.collapseWhitespace) {
collapseWhitespace = changes.collapseWhitespace.newValue === true;
logger.aiLog("collapseWhitespace updated from storage change", {debug: true}, collapseWhitespace);
}
}); });
} catch (err) { } catch (err) {
logger.aiLog("failed to load config", {level: 'error'}, err); logger.aiLog("failed to load config", {level: 'error'}, err);

View file

@ -103,6 +103,21 @@
<input type="checkbox" id="html-to-markdown"> Convert HTML body to Markdown <input type="checkbox" id="html-to-markdown"> Convert HTML body to Markdown
</label> </label>
</div> </div>
<div class="field">
<label class="checkbox">
<input type="checkbox" id="strip-url-params"> Remove URL tracking parameters
</label>
</div>
<div class="field">
<label class="checkbox">
<input type="checkbox" id="alt-text-images"> Replace images with alt text
</label>
</div>
<div class="field">
<label class="checkbox">
<input type="checkbox" id="collapse-whitespace"> Collapse long whitespace
</label>
</div>
<div class="field"> <div class="field">
<label class="label" for="max_tokens">Max tokens</label> <label class="label" for="max_tokens">Max tokens</label>
<div class="control"> <div class="control">

View file

@ -10,6 +10,9 @@ document.addEventListener('DOMContentLoaded', async () => {
'aiParams', 'aiParams',
'debugLogging', 'debugLogging',
'htmlToMarkdown', 'htmlToMarkdown',
'stripUrlParams',
'altTextImages',
'collapseWhitespace',
'aiRules', 'aiRules',
'aiCache' 'aiCache'
]); ]);
@ -85,6 +88,15 @@ document.addEventListener('DOMContentLoaded', async () => {
const htmlToggle = document.getElementById('html-to-markdown'); const htmlToggle = document.getElementById('html-to-markdown');
htmlToggle.checked = defaults.htmlToMarkdown === true; htmlToggle.checked = defaults.htmlToMarkdown === true;
const stripUrlToggle = document.getElementById('strip-url-params');
stripUrlToggle.checked = defaults.stripUrlParams === true;
const altTextToggle = document.getElementById('alt-text-images');
altTextToggle.checked = defaults.altTextImages === true;
const collapseWhitespaceToggle = document.getElementById('collapse-whitespace');
collapseWhitespaceToggle.checked = defaults.collapseWhitespace === true;
const aiParams = Object.assign({}, DEFAULT_AI_PARAMS, defaults.aiParams || {}); const aiParams = Object.assign({}, DEFAULT_AI_PARAMS, defaults.aiParams || {});
for (const [key, val] of Object.entries(aiParams)) { for (const [key, val] of Object.entries(aiParams)) {
const el = document.getElementById(key); const el = document.getElementById(key);
@ -418,7 +430,10 @@ document.addEventListener('DOMContentLoaded', async () => {
const stopProcessing = ruleEl.querySelector('.stop-processing')?.checked; const stopProcessing = ruleEl.querySelector('.stop-processing')?.checked;
return { criterion, actions, stopProcessing }; return { criterion, actions, stopProcessing };
}).filter(r => r.criterion); }).filter(r => r.criterion);
await storage.local.set({ endpoint, templateName, customTemplate: customTemplateText, customSystemPrompt, aiParams: aiParamsSave, debugLogging, htmlToMarkdown, aiRules: rules }); const stripUrlParams = stripUrlToggle.checked;
const altTextImages = altTextToggle.checked;
const collapseWhitespace = collapseWhitespaceToggle.checked;
await storage.local.set({ endpoint, templateName, customTemplate: customTemplateText, customSystemPrompt, aiParams: aiParamsSave, debugLogging, htmlToMarkdown, stripUrlParams, altTextImages, collapseWhitespace, aiRules: rules });
try { try {
await AiClassifier.setConfig({ endpoint, templateName, customTemplate: customTemplateText, customSystemPrompt, aiParams: aiParamsSave, debugLogging }); await AiClassifier.setConfig({ endpoint, templateName, customTemplate: customTemplateText, customSystemPrompt, aiParams: aiParamsSave, debugLogging });
logger.setDebug(debugLogging); logger.setDebug(debugLogging);