Merge pull request #62 from wagesj45/codex/implement-email-message-optimizations
Implement HTML sanitization options
This commit is contained in:
commit
a358bc5703
4 changed files with 91 additions and 9 deletions
|
@ -14,6 +14,9 @@
|
||||||
"template.mistral": { "message": "Mistral" },
|
"template.mistral": { "message": "Mistral" },
|
||||||
"template.custom": { "message": "Custom" },
|
"template.custom": { "message": "Custom" },
|
||||||
"options.save": { "message": "Save" },
|
"options.save": { "message": "Save" },
|
||||||
"options.debugLogging": { "message": "Enable debug logging" }
|
"options.debugLogging": { "message": "Enable debug logging" },
|
||||||
,"options.htmlToMarkdown": { "message": "Convert HTML body to Markdown" }
|
"options.htmlToMarkdown": { "message": "Convert HTML body to Markdown" },
|
||||||
|
"options.stripUrlParams": { "message": "Remove URL tracking parameters" },
|
||||||
|
"options.altTextImages": { "message": "Replace images with alt text" },
|
||||||
|
"options.collapseWhitespace": { "message": "Collapse long whitespace" }
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,6 +23,9 @@ let timingStats = { count: 0, mean: 0, m2: 0, total: 0, last: -1 };
|
||||||
let currentStart = 0;
|
let currentStart = 0;
|
||||||
let logGetTiming = true;
|
let logGetTiming = true;
|
||||||
let htmlToMarkdown = false;
|
let htmlToMarkdown = false;
|
||||||
|
let stripUrlParams = false;
|
||||||
|
let altTextImages = false;
|
||||||
|
let collapseWhitespace = false;
|
||||||
let TurndownService = null;
|
let TurndownService = null;
|
||||||
|
|
||||||
function setIcon(path) {
|
function setIcon(path) {
|
||||||
|
@ -58,6 +61,20 @@ function replaceInlineBase64(text) {
|
||||||
m => `[base64: ${byteSize(m)} bytes]`);
|
m => `[base64: ${byteSize(m)} bytes]`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Apply the optional text clean-ups controlled by the module-level
 * `stripUrlParams` and `collapseWhitespace` flags.
 *
 * @param {*} text - Value to sanitize; coerced with `String()`.
 * @returns {string} The sanitized text. When both flags are off the
 *   coerced input is returned unchanged.
 */
function sanitizeString(text) {
  let result = String(text);

  if (stripUrlParams) {
    // `<`, `>` and `"` are never valid unescaped inside a URL, so stop the
    // match there. Otherwise a wrapped link such as `<https://a.b/?x=1>`
    // would lose its closing delimiter together with the query string.
    result = result.replace(/https?:\/\/[^\s)<>"]+/g, (url) => {
      const queryStart = url.indexOf('?');
      // Everything from the first `?` onward is dropped.
      return queryStart >= 0 ? url.slice(0, queryStart) : url;
    });
  }

  if (collapseWhitespace) {
    result = result
      // Runs of spaces, tabs and non-breaking spaces become a single space.
      .replace(/[ \t\u00A0]{2,}/g, ' ')
      // Three or more consecutive line breaks (LF or CRLF) become one
      // blank line.
      .replace(/(?:\r?\n){3,}/g, '\n\n');
  }

  return result;
}
|
||||||
|
|
||||||
function collectText(part, bodyParts, attachments) {
|
function collectText(part, bodyParts, attachments) {
|
||||||
if (part.parts && part.parts.length) {
|
if (part.parts && part.parts.length) {
|
||||||
for (const p of part.parts) collectText(p, bodyParts, attachments);
|
for (const p of part.parts) collectText(p, bodyParts, attachments);
|
||||||
|
@ -72,19 +89,35 @@ function collectText(part, bodyParts, attachments) {
|
||||||
attachments.push(`${name} (${ct}, ${part.size || byteSize(body)} bytes)`);
|
attachments.push(`${name} (${ct}, ${part.size || byteSize(body)} bytes)`);
|
||||||
} else if (ct.startsWith("text/html")) {
|
} else if (ct.startsWith("text/html")) {
|
||||||
const doc = new DOMParser().parseFromString(body, 'text/html');
|
const doc = new DOMParser().parseFromString(body, 'text/html');
|
||||||
|
if (altTextImages) {
|
||||||
|
doc.querySelectorAll('img').forEach(img => {
|
||||||
|
const alt = img.getAttribute('alt') || '';
|
||||||
|
img.replaceWith(doc.createTextNode(alt));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
if (stripUrlParams) {
|
||||||
|
doc.querySelectorAll('[href]').forEach(a => {
|
||||||
|
const href = a.getAttribute('href');
|
||||||
|
if (href) a.setAttribute('href', href.split('?')[0]);
|
||||||
|
});
|
||||||
|
doc.querySelectorAll('[src]').forEach(e => {
|
||||||
|
const src = e.getAttribute('src');
|
||||||
|
if (src) e.setAttribute('src', src.split('?')[0]);
|
||||||
|
});
|
||||||
|
}
|
||||||
if (htmlToMarkdown && TurndownService) {
|
if (htmlToMarkdown && TurndownService) {
|
||||||
try {
|
try {
|
||||||
const td = new TurndownService();
|
const td = new TurndownService();
|
||||||
const md = td.turndown(doc.body.innerHTML || body);
|
const md = sanitizeString(td.turndown(doc.body.innerHTML || body));
|
||||||
bodyParts.push(replaceInlineBase64(`[HTML Body converted to Markdown]\n${md}`));
|
bodyParts.push(replaceInlineBase64(`[HTML Body converted to Markdown]\n${md}`));
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
bodyParts.push(replaceInlineBase64(doc.body.textContent || ""));
|
bodyParts.push(replaceInlineBase64(sanitizeString(doc.body.textContent || "")));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
bodyParts.push(replaceInlineBase64(doc.body.textContent || ""));
|
bodyParts.push(replaceInlineBase64(sanitizeString(doc.body.textContent || "")));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
bodyParts.push(replaceInlineBase64(body));
|
bodyParts.push(replaceInlineBase64(sanitizeString(body)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -96,7 +129,8 @@ function buildEmailText(full) {
|
||||||
.map(([k,v]) => `${k}: ${v.join(' ')}`)
|
.map(([k,v]) => `${k}: ${v.join(' ')}`)
|
||||||
.join('\n');
|
.join('\n');
|
||||||
const attachInfo = `Attachments: ${attachments.length}` + (attachments.length ? "\n" + attachments.map(a => ` - ${a}`).join('\n') : "");
|
const attachInfo = `Attachments: ${attachments.length}` + (attachments.length ? "\n" + attachments.map(a => ` - ${a}`).join('\n') : "");
|
||||||
return `${headers}\n${attachInfo}\n\n${bodyParts.join('\n')}`.trim();
|
const combined = `${headers}\n${attachInfo}\n\n${bodyParts.join('\n')}`.trim();
|
||||||
|
return sanitizeString(combined);
|
||||||
}
|
}
|
||||||
async function applyAiRules(idsInput) {
|
async function applyAiRules(idsInput) {
|
||||||
const ids = Array.isArray(idsInput) ? idsInput : [idsInput];
|
const ids = Array.isArray(idsInput) ? idsInput : [idsInput];
|
||||||
|
@ -233,11 +267,14 @@ async function clearCacheForMessages(idsInput) {
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const store = await storage.local.get(["endpoint", "templateName", "customTemplate", "customSystemPrompt", "aiParams", "debugLogging", "htmlToMarkdown", "aiRules"]);
|
const store = await storage.local.get(["endpoint", "templateName", "customTemplate", "customSystemPrompt", "aiParams", "debugLogging", "htmlToMarkdown", "stripUrlParams", "altTextImages", "collapseWhitespace", "aiRules"]);
|
||||||
logger.setDebug(store.debugLogging);
|
logger.setDebug(store.debugLogging);
|
||||||
await AiClassifier.setConfig(store);
|
await AiClassifier.setConfig(store);
|
||||||
await AiClassifier.init();
|
await AiClassifier.init();
|
||||||
htmlToMarkdown = store.htmlToMarkdown === true;
|
htmlToMarkdown = store.htmlToMarkdown === true;
|
||||||
|
stripUrlParams = store.stripUrlParams === true;
|
||||||
|
altTextImages = store.altTextImages === true;
|
||||||
|
collapseWhitespace = store.collapseWhitespace === true;
|
||||||
const savedStats = await storage.local.get('classifyStats');
|
const savedStats = await storage.local.get('classifyStats');
|
||||||
if (savedStats.classifyStats && typeof savedStats.classifyStats === 'object') {
|
if (savedStats.classifyStats && typeof savedStats.classifyStats === 'object') {
|
||||||
Object.assign(timingStats, savedStats.classifyStats);
|
Object.assign(timingStats, savedStats.classifyStats);
|
||||||
|
@ -273,6 +310,18 @@ async function clearCacheForMessages(idsInput) {
|
||||||
htmlToMarkdown = changes.htmlToMarkdown.newValue === true;
|
htmlToMarkdown = changes.htmlToMarkdown.newValue === true;
|
||||||
logger.aiLog("htmlToMarkdown updated from storage change", {debug: true}, htmlToMarkdown);
|
logger.aiLog("htmlToMarkdown updated from storage change", {debug: true}, htmlToMarkdown);
|
||||||
}
|
}
|
||||||
|
if (changes.stripUrlParams) {
|
||||||
|
stripUrlParams = changes.stripUrlParams.newValue === true;
|
||||||
|
logger.aiLog("stripUrlParams updated from storage change", {debug: true}, stripUrlParams);
|
||||||
|
}
|
||||||
|
if (changes.altTextImages) {
|
||||||
|
altTextImages = changes.altTextImages.newValue === true;
|
||||||
|
logger.aiLog("altTextImages updated from storage change", {debug: true}, altTextImages);
|
||||||
|
}
|
||||||
|
if (changes.collapseWhitespace) {
|
||||||
|
collapseWhitespace = changes.collapseWhitespace.newValue === true;
|
||||||
|
logger.aiLog("collapseWhitespace updated from storage change", {debug: true}, collapseWhitespace);
|
||||||
|
}
|
||||||
});
|
});
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
logger.aiLog("failed to load config", {level: 'error'}, err);
|
logger.aiLog("failed to load config", {level: 'error'}, err);
|
||||||
|
|
|
@ -103,6 +103,21 @@
|
||||||
<input type="checkbox" id="html-to-markdown"> Convert HTML body to Markdown
|
<input type="checkbox" id="html-to-markdown"> Convert HTML body to Markdown
|
||||||
</label>
|
</label>
|
||||||
</div>
|
</div>
|
||||||
|
<div class="field">
|
||||||
|
<label class="checkbox">
|
||||||
|
<input type="checkbox" id="strip-url-params"> Remove URL tracking parameters
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
<div class="field">
|
||||||
|
<label class="checkbox">
|
||||||
|
<input type="checkbox" id="alt-text-images"> Replace images with alt text
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
<div class="field">
|
||||||
|
<label class="checkbox">
|
||||||
|
<input type="checkbox" id="collapse-whitespace"> Collapse long whitespace
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
<div class="field">
|
<div class="field">
|
||||||
<label class="label" for="max_tokens">Max tokens</label>
|
<label class="label" for="max_tokens">Max tokens</label>
|
||||||
<div class="control">
|
<div class="control">
|
||||||
|
|
|
@ -10,6 +10,9 @@ document.addEventListener('DOMContentLoaded', async () => {
|
||||||
'aiParams',
|
'aiParams',
|
||||||
'debugLogging',
|
'debugLogging',
|
||||||
'htmlToMarkdown',
|
'htmlToMarkdown',
|
||||||
|
'stripUrlParams',
|
||||||
|
'altTextImages',
|
||||||
|
'collapseWhitespace',
|
||||||
'aiRules',
|
'aiRules',
|
||||||
'aiCache'
|
'aiCache'
|
||||||
]);
|
]);
|
||||||
|
@ -85,6 +88,15 @@ document.addEventListener('DOMContentLoaded', async () => {
|
||||||
const htmlToggle = document.getElementById('html-to-markdown');
|
const htmlToggle = document.getElementById('html-to-markdown');
|
||||||
htmlToggle.checked = defaults.htmlToMarkdown === true;
|
htmlToggle.checked = defaults.htmlToMarkdown === true;
|
||||||
|
|
||||||
|
const stripUrlToggle = document.getElementById('strip-url-params');
|
||||||
|
stripUrlToggle.checked = defaults.stripUrlParams === true;
|
||||||
|
|
||||||
|
const altTextToggle = document.getElementById('alt-text-images');
|
||||||
|
altTextToggle.checked = defaults.altTextImages === true;
|
||||||
|
|
||||||
|
const collapseWhitespaceToggle = document.getElementById('collapse-whitespace');
|
||||||
|
collapseWhitespaceToggle.checked = defaults.collapseWhitespace === true;
|
||||||
|
|
||||||
const aiParams = Object.assign({}, DEFAULT_AI_PARAMS, defaults.aiParams || {});
|
const aiParams = Object.assign({}, DEFAULT_AI_PARAMS, defaults.aiParams || {});
|
||||||
for (const [key, val] of Object.entries(aiParams)) {
|
for (const [key, val] of Object.entries(aiParams)) {
|
||||||
const el = document.getElementById(key);
|
const el = document.getElementById(key);
|
||||||
|
@ -418,7 +430,10 @@ document.addEventListener('DOMContentLoaded', async () => {
|
||||||
const stopProcessing = ruleEl.querySelector('.stop-processing')?.checked;
|
const stopProcessing = ruleEl.querySelector('.stop-processing')?.checked;
|
||||||
return { criterion, actions, stopProcessing };
|
return { criterion, actions, stopProcessing };
|
||||||
}).filter(r => r.criterion);
|
}).filter(r => r.criterion);
|
||||||
await storage.local.set({ endpoint, templateName, customTemplate: customTemplateText, customSystemPrompt, aiParams: aiParamsSave, debugLogging, htmlToMarkdown, aiRules: rules });
|
const stripUrlParams = stripUrlToggle.checked;
|
||||||
|
const altTextImages = altTextToggle.checked;
|
||||||
|
const collapseWhitespace = collapseWhitespaceToggle.checked;
|
||||||
|
await storage.local.set({ endpoint, templateName, customTemplate: customTemplateText, customSystemPrompt, aiParams: aiParamsSave, debugLogging, htmlToMarkdown, stripUrlParams, altTextImages, collapseWhitespace, aiRules: rules });
|
||||||
try {
|
try {
|
||||||
await AiClassifier.setConfig({ endpoint, templateName, customTemplate: customTemplateText, customSystemPrompt, aiParams: aiParamsSave, debugLogging });
|
await AiClassifier.setConfig({ endpoint, templateName, customTemplate: customTemplateText, customSystemPrompt, aiParams: aiParamsSave, debugLogging });
|
||||||
logger.setDebug(debugLogging);
|
logger.setDebug(debugLogging);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue