Add HTML sanitization options
This commit is contained in:
parent
b160f2221e
commit
149ff03cf9
4 changed files with 91 additions and 9 deletions
|
@ -14,6 +14,9 @@
|
|||
"template.mistral": { "message": "Mistral" },
|
||||
"template.custom": { "message": "Custom" },
|
||||
"options.save": { "message": "Save" },
|
||||
"options.debugLogging": { "message": "Enable debug logging" }
|
||||
,"options.htmlToMarkdown": { "message": "Convert HTML body to Markdown" }
|
||||
"options.debugLogging": { "message": "Enable debug logging" },
|
||||
"options.htmlToMarkdown": { "message": "Convert HTML body to Markdown" },
|
||||
"options.stripUrlParams": { "message": "Remove URL tracking parameters" },
|
||||
"options.altTextImages": { "message": "Replace images with alt text" },
|
||||
"options.collapseWhitespace": { "message": "Collapse long whitespace" }
|
||||
}
|
||||
|
|
|
@ -23,6 +23,9 @@ let timingStats = { count: 0, mean: 0, m2: 0, total: 0, last: -1 };
|
|||
let currentStart = 0;
|
||||
let logGetTiming = true;
|
||||
let htmlToMarkdown = false;
|
||||
let stripUrlParams = false;
|
||||
let altTextImages = false;
|
||||
let collapseWhitespace = false;
|
||||
let TurndownService = null;
|
||||
|
||||
function setIcon(path) {
|
||||
|
@ -58,6 +61,20 @@ function replaceInlineBase64(text) {
|
|||
m => `[base64: ${byteSize(m)} bytes]`);
|
||||
}
|
||||
|
||||
/**
 * Apply the optional plain-text clean-ups selected in the options page.
 *
 * Reads two module-level flags:
 *  - `stripUrlParams`: removes the query string (everything from the first
 *    `?`) from every http/https URL found in the text.
 *  - `collapseWhitespace`: squeezes runs of two or more spaces, tabs or
 *    no-break spaces into one space and runs of three or more newlines into
 *    exactly two.
 *
 * @param {*} text - Value to clean; coerced to a string. `null` and
 *   `undefined` are treated as an empty string.
 * @returns {string} The cleaned text.
 */
function sanitizeString(text) {
    // Avoid emitting the literal words "null"/"undefined" into the message text.
    let t = text == null ? '' : String(text);
    if (stripUrlParams) {
        // A URL ends at whitespace or at a delimiter that commonly wraps links in
        // plain text and Markdown: ")", "<", ">", '"' and "]". Without these stops
        // the match would run through the closing delimiter and the truncation
        // below would delete it together with whatever follows.
        t = t.replace(/https?:\/\/[^\s)<>"\]]+/g, m => {
            // Sentence punctuation glued to the end of a URL belongs to the
            // surrounding prose, so set it aside and re-append it afterwards.
            const trailing = m.match(/[.,;:!?]+$/);
            const tail = trailing ? trailing[0] : '';
            const core = tail ? m.slice(0, -tail.length) : m;
            const idx = core.indexOf('?');
            return (idx >= 0 ? core.slice(0, idx) : core) + tail;
        });
    }
    if (collapseWhitespace) {
        t = t.replace(/[ \t\u00A0]{2,}/g, ' ').replace(/\n{3,}/g, '\n\n');
    }
    return t;
}
|
||||
|
||||
function collectText(part, bodyParts, attachments) {
|
||||
if (part.parts && part.parts.length) {
|
||||
for (const p of part.parts) collectText(p, bodyParts, attachments);
|
||||
|
@ -72,19 +89,35 @@ function collectText(part, bodyParts, attachments) {
|
|||
attachments.push(`${name} (${ct}, ${part.size || byteSize(body)} bytes)`);
|
||||
} else if (ct.startsWith("text/html")) {
|
||||
const doc = new DOMParser().parseFromString(body, 'text/html');
|
||||
if (altTextImages) {
|
||||
doc.querySelectorAll('img').forEach(img => {
|
||||
const alt = img.getAttribute('alt') || '';
|
||||
img.replaceWith(doc.createTextNode(alt));
|
||||
});
|
||||
}
|
||||
if (stripUrlParams) {
|
||||
doc.querySelectorAll('[href]').forEach(a => {
|
||||
const href = a.getAttribute('href');
|
||||
if (href) a.setAttribute('href', href.split('?')[0]);
|
||||
});
|
||||
doc.querySelectorAll('[src]').forEach(e => {
|
||||
const src = e.getAttribute('src');
|
||||
if (src) e.setAttribute('src', src.split('?')[0]);
|
||||
});
|
||||
}
|
||||
if (htmlToMarkdown && TurndownService) {
|
||||
try {
|
||||
const td = new TurndownService();
|
||||
const md = td.turndown(doc.body.innerHTML || body);
|
||||
const md = sanitizeString(td.turndown(doc.body.innerHTML || body));
|
||||
bodyParts.push(replaceInlineBase64(`[HTML Body converted to Markdown]\n${md}`));
|
||||
} catch (e) {
|
||||
bodyParts.push(replaceInlineBase64(doc.body.textContent || ""));
|
||||
bodyParts.push(replaceInlineBase64(sanitizeString(doc.body.textContent || "")));
|
||||
}
|
||||
} else {
|
||||
bodyParts.push(replaceInlineBase64(doc.body.textContent || ""));
|
||||
bodyParts.push(replaceInlineBase64(sanitizeString(doc.body.textContent || "")));
|
||||
}
|
||||
} else {
|
||||
bodyParts.push(replaceInlineBase64(body));
|
||||
bodyParts.push(replaceInlineBase64(sanitizeString(body)));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -96,7 +129,8 @@ function buildEmailText(full) {
|
|||
.map(([k,v]) => `${k}: ${v.join(' ')}`)
|
||||
.join('\n');
|
||||
const attachInfo = `Attachments: ${attachments.length}` + (attachments.length ? "\n" + attachments.map(a => ` - ${a}`).join('\n') : "");
|
||||
return `${headers}\n${attachInfo}\n\n${bodyParts.join('\n')}`.trim();
|
||||
const combined = `${headers}\n${attachInfo}\n\n${bodyParts.join('\n')}`.trim();
|
||||
return sanitizeString(combined);
|
||||
}
|
||||
async function applyAiRules(idsInput) {
|
||||
const ids = Array.isArray(idsInput) ? idsInput : [idsInput];
|
||||
|
@ -233,11 +267,14 @@ async function clearCacheForMessages(idsInput) {
|
|||
}
|
||||
|
||||
try {
|
||||
const store = await storage.local.get(["endpoint", "templateName", "customTemplate", "customSystemPrompt", "aiParams", "debugLogging", "htmlToMarkdown", "aiRules"]);
|
||||
const store = await storage.local.get(["endpoint", "templateName", "customTemplate", "customSystemPrompt", "aiParams", "debugLogging", "htmlToMarkdown", "stripUrlParams", "altTextImages", "collapseWhitespace", "aiRules"]);
|
||||
logger.setDebug(store.debugLogging);
|
||||
await AiClassifier.setConfig(store);
|
||||
await AiClassifier.init();
|
||||
htmlToMarkdown = store.htmlToMarkdown === true;
|
||||
stripUrlParams = store.stripUrlParams === true;
|
||||
altTextImages = store.altTextImages === true;
|
||||
collapseWhitespace = store.collapseWhitespace === true;
|
||||
const savedStats = await storage.local.get('classifyStats');
|
||||
if (savedStats.classifyStats && typeof savedStats.classifyStats === 'object') {
|
||||
Object.assign(timingStats, savedStats.classifyStats);
|
||||
|
@ -273,6 +310,18 @@ async function clearCacheForMessages(idsInput) {
|
|||
htmlToMarkdown = changes.htmlToMarkdown.newValue === true;
|
||||
logger.aiLog("htmlToMarkdown updated from storage change", {debug: true}, htmlToMarkdown);
|
||||
}
|
||||
if (changes.stripUrlParams) {
|
||||
stripUrlParams = changes.stripUrlParams.newValue === true;
|
||||
logger.aiLog("stripUrlParams updated from storage change", {debug: true}, stripUrlParams);
|
||||
}
|
||||
if (changes.altTextImages) {
|
||||
altTextImages = changes.altTextImages.newValue === true;
|
||||
logger.aiLog("altTextImages updated from storage change", {debug: true}, altTextImages);
|
||||
}
|
||||
if (changes.collapseWhitespace) {
|
||||
collapseWhitespace = changes.collapseWhitespace.newValue === true;
|
||||
logger.aiLog("collapseWhitespace updated from storage change", {debug: true}, collapseWhitespace);
|
||||
}
|
||||
});
|
||||
} catch (err) {
|
||||
logger.aiLog("failed to load config", {level: 'error'}, err);
|
||||
|
|
|
@ -103,6 +103,21 @@
|
|||
<input type="checkbox" id="html-to-markdown"> Convert HTML body to Markdown
|
||||
</label>
|
||||
</div>
|
||||
<div class="field">
|
||||
<label class="checkbox">
|
||||
<input type="checkbox" id="strip-url-params"> Remove URL tracking parameters
|
||||
</label>
|
||||
</div>
|
||||
<div class="field">
|
||||
<label class="checkbox">
|
||||
<input type="checkbox" id="alt-text-images"> Replace images with alt text
|
||||
</label>
|
||||
</div>
|
||||
<div class="field">
|
||||
<label class="checkbox">
|
||||
<input type="checkbox" id="collapse-whitespace"> Collapse long whitespace
|
||||
</label>
|
||||
</div>
|
||||
<div class="field">
|
||||
<label class="label" for="max_tokens">Max tokens</label>
|
||||
<div class="control">
|
||||
|
|
|
@ -10,6 +10,9 @@ document.addEventListener('DOMContentLoaded', async () => {
|
|||
'aiParams',
|
||||
'debugLogging',
|
||||
'htmlToMarkdown',
|
||||
'stripUrlParams',
|
||||
'altTextImages',
|
||||
'collapseWhitespace',
|
||||
'aiRules',
|
||||
'aiCache'
|
||||
]);
|
||||
|
@ -85,6 +88,15 @@ document.addEventListener('DOMContentLoaded', async () => {
|
|||
const htmlToggle = document.getElementById('html-to-markdown');
|
||||
htmlToggle.checked = defaults.htmlToMarkdown === true;
|
||||
|
||||
const stripUrlToggle = document.getElementById('strip-url-params');
|
||||
stripUrlToggle.checked = defaults.stripUrlParams === true;
|
||||
|
||||
const altTextToggle = document.getElementById('alt-text-images');
|
||||
altTextToggle.checked = defaults.altTextImages === true;
|
||||
|
||||
const collapseWhitespaceToggle = document.getElementById('collapse-whitespace');
|
||||
collapseWhitespaceToggle.checked = defaults.collapseWhitespace === true;
|
||||
|
||||
const aiParams = Object.assign({}, DEFAULT_AI_PARAMS, defaults.aiParams || {});
|
||||
for (const [key, val] of Object.entries(aiParams)) {
|
||||
const el = document.getElementById(key);
|
||||
|
@ -418,7 +430,10 @@ document.addEventListener('DOMContentLoaded', async () => {
|
|||
const stopProcessing = ruleEl.querySelector('.stop-processing')?.checked;
|
||||
return { criterion, actions, stopProcessing };
|
||||
}).filter(r => r.criterion);
|
||||
await storage.local.set({ endpoint, templateName, customTemplate: customTemplateText, customSystemPrompt, aiParams: aiParamsSave, debugLogging, htmlToMarkdown, aiRules: rules });
|
||||
const stripUrlParams = stripUrlToggle.checked;
|
||||
const altTextImages = altTextToggle.checked;
|
||||
const collapseWhitespace = collapseWhitespaceToggle.checked;
|
||||
await storage.local.set({ endpoint, templateName, customTemplate: customTemplateText, customSystemPrompt, aiParams: aiParamsSave, debugLogging, htmlToMarkdown, stripUrlParams, altTextImages, collapseWhitespace, aiRules: rules });
|
||||
try {
|
||||
await AiClassifier.setConfig({ endpoint, templateName, customTemplate: customTemplateText, customSystemPrompt, aiParams: aiParamsSave, debugLogging });
|
||||
logger.setDebug(debugLogging);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue