Add HTML sanitization options
This commit is contained in:
		
					parent
					
						
							
								b160f2221e
							
						
					
				
			
			
				commit
				
					
						149ff03cf9
					
				
			
		
					 4 changed files with 91 additions and 9 deletions
				
			
		|  | @ -14,6 +14,9 @@ | ||||||
|   "template.mistral": { "message": "Mistral" }, |   "template.mistral": { "message": "Mistral" }, | ||||||
|   "template.custom": { "message": "Custom" }, |   "template.custom": { "message": "Custom" }, | ||||||
|   "options.save": { "message": "Save" }, |   "options.save": { "message": "Save" }, | ||||||
|   "options.debugLogging": { "message": "Enable debug logging" } |   "options.debugLogging": { "message": "Enable debug logging" }, | ||||||
|   ,"options.htmlToMarkdown": { "message": "Convert HTML body to Markdown" } |   "options.htmlToMarkdown": { "message": "Convert HTML body to Markdown" }, | ||||||
|  |   "options.stripUrlParams": { "message": "Remove URL tracking parameters" }, | ||||||
|  |   "options.altTextImages": { "message": "Replace images with alt text" }, | ||||||
|  |   "options.collapseWhitespace": { "message": "Collapse long whitespace" } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | @ -23,6 +23,9 @@ let timingStats = { count: 0, mean: 0, m2: 0, total: 0, last: -1 }; | ||||||
| let currentStart = 0; | let currentStart = 0; | ||||||
| let logGetTiming = true; | let logGetTiming = true; | ||||||
| let htmlToMarkdown = false; | let htmlToMarkdown = false; | ||||||
|  | let stripUrlParams = false; | ||||||
|  | let altTextImages = false; | ||||||
|  | let collapseWhitespace = false; | ||||||
| let TurndownService = null; | let TurndownService = null; | ||||||
| 
 | 
 | ||||||
| function setIcon(path) { | function setIcon(path) { | ||||||
|  | @ -58,6 +61,20 @@ function replaceInlineBase64(text) { | ||||||
|         m => `[base64: ${byteSize(m)} bytes]`); |         m => `[base64: ${byteSize(m)} bytes]`); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | function sanitizeString(text) { | ||||||
|  |     let t = String(text); | ||||||
|  |     if (stripUrlParams) { | ||||||
|  |         t = t.replace(/https?:\/\/[^\s)]+/g, m => { | ||||||
|  |             const idx = m.indexOf('?'); | ||||||
|  |             return idx >= 0 ? m.slice(0, idx) : m; | ||||||
|  |         }); | ||||||
|  |     } | ||||||
|  |     if (collapseWhitespace) { | ||||||
|  |         t = t.replace(/[ \t\u00A0]{2,}/g, ' ').replace(/\n{3,}/g, '\n\n'); | ||||||
|  |     } | ||||||
|  |     return t; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| function collectText(part, bodyParts, attachments) { | function collectText(part, bodyParts, attachments) { | ||||||
|     if (part.parts && part.parts.length) { |     if (part.parts && part.parts.length) { | ||||||
|         for (const p of part.parts) collectText(p, bodyParts, attachments); |         for (const p of part.parts) collectText(p, bodyParts, attachments); | ||||||
|  | @ -72,19 +89,35 @@ function collectText(part, bodyParts, attachments) { | ||||||
|         attachments.push(`${name} (${ct}, ${part.size || byteSize(body)} bytes)`); |         attachments.push(`${name} (${ct}, ${part.size || byteSize(body)} bytes)`); | ||||||
|     } else if (ct.startsWith("text/html")) { |     } else if (ct.startsWith("text/html")) { | ||||||
|         const doc = new DOMParser().parseFromString(body, 'text/html'); |         const doc = new DOMParser().parseFromString(body, 'text/html'); | ||||||
|  |         if (altTextImages) { | ||||||
|  |             doc.querySelectorAll('img').forEach(img => { | ||||||
|  |                 const alt = img.getAttribute('alt') || ''; | ||||||
|  |                 img.replaceWith(doc.createTextNode(alt)); | ||||||
|  |             }); | ||||||
|  |         } | ||||||
|  |         if (stripUrlParams) { | ||||||
|  |             doc.querySelectorAll('[href]').forEach(a => { | ||||||
|  |                 const href = a.getAttribute('href'); | ||||||
|  |                 if (href) a.setAttribute('href', href.split('?')[0]); | ||||||
|  |             }); | ||||||
|  |             doc.querySelectorAll('[src]').forEach(e => { | ||||||
|  |                 const src = e.getAttribute('src'); | ||||||
|  |                 if (src) e.setAttribute('src', src.split('?')[0]); | ||||||
|  |             }); | ||||||
|  |         } | ||||||
|         if (htmlToMarkdown && TurndownService) { |         if (htmlToMarkdown && TurndownService) { | ||||||
|             try { |             try { | ||||||
|                 const td = new TurndownService(); |                 const td = new TurndownService(); | ||||||
|                 const md = td.turndown(doc.body.innerHTML || body); |                 const md = sanitizeString(td.turndown(doc.body.innerHTML || body)); | ||||||
|                 bodyParts.push(replaceInlineBase64(`[HTML Body converted to Markdown]\n${md}`)); |                 bodyParts.push(replaceInlineBase64(`[HTML Body converted to Markdown]\n${md}`)); | ||||||
|             } catch (e) { |             } catch (e) { | ||||||
|                 bodyParts.push(replaceInlineBase64(doc.body.textContent || "")); |                 bodyParts.push(replaceInlineBase64(sanitizeString(doc.body.textContent || ""))); | ||||||
|             } |             } | ||||||
|         } else { |         } else { | ||||||
|             bodyParts.push(replaceInlineBase64(doc.body.textContent || "")); |             bodyParts.push(replaceInlineBase64(sanitizeString(doc.body.textContent || ""))); | ||||||
|         } |         } | ||||||
|     } else { |     } else { | ||||||
|         bodyParts.push(replaceInlineBase64(body)); |         bodyParts.push(replaceInlineBase64(sanitizeString(body))); | ||||||
|     } |     } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | @ -96,7 +129,8 @@ function buildEmailText(full) { | ||||||
|         .map(([k,v]) => `${k}: ${v.join(' ')}`) |         .map(([k,v]) => `${k}: ${v.join(' ')}`) | ||||||
|         .join('\n'); |         .join('\n'); | ||||||
|     const attachInfo = `Attachments: ${attachments.length}` + (attachments.length ? "\n" + attachments.map(a => ` - ${a}`).join('\n') : ""); |     const attachInfo = `Attachments: ${attachments.length}` + (attachments.length ? "\n" + attachments.map(a => ` - ${a}`).join('\n') : ""); | ||||||
|     return `${headers}\n${attachInfo}\n\n${bodyParts.join('\n')}`.trim(); |     const combined = `${headers}\n${attachInfo}\n\n${bodyParts.join('\n')}`.trim(); | ||||||
|  |     return sanitizeString(combined); | ||||||
| } | } | ||||||
| async function applyAiRules(idsInput) { | async function applyAiRules(idsInput) { | ||||||
|     const ids = Array.isArray(idsInput) ? idsInput : [idsInput]; |     const ids = Array.isArray(idsInput) ? idsInput : [idsInput]; | ||||||
|  | @ -233,11 +267,14 @@ async function clearCacheForMessages(idsInput) { | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     try { |     try { | ||||||
|         const store = await storage.local.get(["endpoint", "templateName", "customTemplate", "customSystemPrompt", "aiParams", "debugLogging", "htmlToMarkdown", "aiRules"]); |         const store = await storage.local.get(["endpoint", "templateName", "customTemplate", "customSystemPrompt", "aiParams", "debugLogging", "htmlToMarkdown", "stripUrlParams", "altTextImages", "collapseWhitespace", "aiRules"]); | ||||||
|         logger.setDebug(store.debugLogging); |         logger.setDebug(store.debugLogging); | ||||||
|         await AiClassifier.setConfig(store); |         await AiClassifier.setConfig(store); | ||||||
|         await AiClassifier.init(); |         await AiClassifier.init(); | ||||||
|         htmlToMarkdown = store.htmlToMarkdown === true; |         htmlToMarkdown = store.htmlToMarkdown === true; | ||||||
|  |         stripUrlParams = store.stripUrlParams === true; | ||||||
|  |         altTextImages = store.altTextImages === true; | ||||||
|  |         collapseWhitespace = store.collapseWhitespace === true; | ||||||
|         const savedStats = await storage.local.get('classifyStats'); |         const savedStats = await storage.local.get('classifyStats'); | ||||||
|         if (savedStats.classifyStats && typeof savedStats.classifyStats === 'object') { |         if (savedStats.classifyStats && typeof savedStats.classifyStats === 'object') { | ||||||
|             Object.assign(timingStats, savedStats.classifyStats); |             Object.assign(timingStats, savedStats.classifyStats); | ||||||
|  | @ -273,6 +310,18 @@ async function clearCacheForMessages(idsInput) { | ||||||
|                 htmlToMarkdown = changes.htmlToMarkdown.newValue === true; |                 htmlToMarkdown = changes.htmlToMarkdown.newValue === true; | ||||||
|                 logger.aiLog("htmlToMarkdown updated from storage change", {debug: true}, htmlToMarkdown); |                 logger.aiLog("htmlToMarkdown updated from storage change", {debug: true}, htmlToMarkdown); | ||||||
|             } |             } | ||||||
|  |             if (changes.stripUrlParams) { | ||||||
|  |                 stripUrlParams = changes.stripUrlParams.newValue === true; | ||||||
|  |                 logger.aiLog("stripUrlParams updated from storage change", {debug: true}, stripUrlParams); | ||||||
|  |             } | ||||||
|  |             if (changes.altTextImages) { | ||||||
|  |                 altTextImages = changes.altTextImages.newValue === true; | ||||||
|  |                 logger.aiLog("altTextImages updated from storage change", {debug: true}, altTextImages); | ||||||
|  |             } | ||||||
|  |             if (changes.collapseWhitespace) { | ||||||
|  |                 collapseWhitespace = changes.collapseWhitespace.newValue === true; | ||||||
|  |                 logger.aiLog("collapseWhitespace updated from storage change", {debug: true}, collapseWhitespace); | ||||||
|  |             } | ||||||
|         }); |         }); | ||||||
|     } catch (err) { |     } catch (err) { | ||||||
|         logger.aiLog("failed to load config", {level: 'error'}, err); |         logger.aiLog("failed to load config", {level: 'error'}, err); | ||||||
|  |  | ||||||
|  | @ -103,6 +103,21 @@ | ||||||
|                             <input type="checkbox" id="html-to-markdown"> Convert HTML body to Markdown |                             <input type="checkbox" id="html-to-markdown"> Convert HTML body to Markdown | ||||||
|                         </label> |                         </label> | ||||||
|                     </div> |                     </div> | ||||||
|  |                     <div class="field"> | ||||||
|  |                         <label class="checkbox"> | ||||||
|  |                             <input type="checkbox" id="strip-url-params"> Remove URL tracking parameters | ||||||
|  |                         </label> | ||||||
|  |                     </div> | ||||||
|  |                     <div class="field"> | ||||||
|  |                         <label class="checkbox"> | ||||||
|  |                             <input type="checkbox" id="alt-text-images"> Replace images with alt text | ||||||
|  |                         </label> | ||||||
|  |                     </div> | ||||||
|  |                     <div class="field"> | ||||||
|  |                         <label class="checkbox"> | ||||||
|  |                             <input type="checkbox" id="collapse-whitespace"> Collapse long whitespace | ||||||
|  |                         </label> | ||||||
|  |                     </div> | ||||||
|                     <div class="field"> |                     <div class="field"> | ||||||
|                         <label class="label" for="max_tokens">Max tokens</label> |                         <label class="label" for="max_tokens">Max tokens</label> | ||||||
|                         <div class="control"> |                         <div class="control"> | ||||||
|  |  | ||||||
|  | @ -10,6 +10,9 @@ document.addEventListener('DOMContentLoaded', async () => { | ||||||
|         'aiParams', |         'aiParams', | ||||||
|         'debugLogging', |         'debugLogging', | ||||||
|         'htmlToMarkdown', |         'htmlToMarkdown', | ||||||
|  |         'stripUrlParams', | ||||||
|  |         'altTextImages', | ||||||
|  |         'collapseWhitespace', | ||||||
|         'aiRules', |         'aiRules', | ||||||
|         'aiCache' |         'aiCache' | ||||||
|     ]); |     ]); | ||||||
|  | @ -85,6 +88,15 @@ document.addEventListener('DOMContentLoaded', async () => { | ||||||
|     const htmlToggle = document.getElementById('html-to-markdown'); |     const htmlToggle = document.getElementById('html-to-markdown'); | ||||||
|     htmlToggle.checked = defaults.htmlToMarkdown === true; |     htmlToggle.checked = defaults.htmlToMarkdown === true; | ||||||
| 
 | 
 | ||||||
|  |     const stripUrlToggle = document.getElementById('strip-url-params'); | ||||||
|  |     stripUrlToggle.checked = defaults.stripUrlParams === true; | ||||||
|  | 
 | ||||||
|  |     const altTextToggle = document.getElementById('alt-text-images'); | ||||||
|  |     altTextToggle.checked = defaults.altTextImages === true; | ||||||
|  | 
 | ||||||
|  |     const collapseWhitespaceToggle = document.getElementById('collapse-whitespace'); | ||||||
|  |     collapseWhitespaceToggle.checked = defaults.collapseWhitespace === true; | ||||||
|  | 
 | ||||||
|     const aiParams = Object.assign({}, DEFAULT_AI_PARAMS, defaults.aiParams || {}); |     const aiParams = Object.assign({}, DEFAULT_AI_PARAMS, defaults.aiParams || {}); | ||||||
|     for (const [key, val] of Object.entries(aiParams)) { |     for (const [key, val] of Object.entries(aiParams)) { | ||||||
|         const el = document.getElementById(key); |         const el = document.getElementById(key); | ||||||
|  | @ -418,7 +430,10 @@ document.addEventListener('DOMContentLoaded', async () => { | ||||||
|             const stopProcessing = ruleEl.querySelector('.stop-processing')?.checked; |             const stopProcessing = ruleEl.querySelector('.stop-processing')?.checked; | ||||||
|             return { criterion, actions, stopProcessing }; |             return { criterion, actions, stopProcessing }; | ||||||
|         }).filter(r => r.criterion); |         }).filter(r => r.criterion); | ||||||
|         await storage.local.set({ endpoint, templateName, customTemplate: customTemplateText, customSystemPrompt, aiParams: aiParamsSave, debugLogging, htmlToMarkdown, aiRules: rules }); |         const stripUrlParams = stripUrlToggle.checked; | ||||||
|  |         const altTextImages = altTextToggle.checked; | ||||||
|  |         const collapseWhitespace = collapseWhitespaceToggle.checked; | ||||||
|  |         await storage.local.set({ endpoint, templateName, customTemplate: customTemplateText, customSystemPrompt, aiParams: aiParamsSave, debugLogging, htmlToMarkdown, stripUrlParams, altTextImages, collapseWhitespace, aiRules: rules }); | ||||||
|         try { |         try { | ||||||
|             await AiClassifier.setConfig({ endpoint, templateName, customTemplate: customTemplateText, customSystemPrompt, aiParams: aiParamsSave, debugLogging }); |             await AiClassifier.setConfig({ endpoint, templateName, customTemplate: customTemplateText, customSystemPrompt, aiParams: aiParamsSave, debugLogging }); | ||||||
|             logger.setDebug(debugLogging); |             logger.setDebug(debugLogging); | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue