From: john Date: Sun, 30 Sep 2001 16:50:24 +0000 (+0000) Subject: added removeHTMLTags and approveHTMLTags. the latter is non-functional and will... X-Git-Tag: prexmlproducerconfig~387 X-Git-Url: http://erislabs.org.uk/gitweb/?a=commitdiff_plain;h=332d9ec2fc7783bbae58f5eda8b452edf6afc826;p=mir.git Added removeHTMLTags and approveHTMLTags. The latter is non-functional and will cause errors if called, but the patterns are OK (they work in Perl); the gnu.regexp problems must be debugged before it is ready for use. --- diff --git a/source/mir/misc/StringUtil.java b/source/mir/misc/StringUtil.java index 057b201b..09a4b7c8 100755 --- a/source/mir/misc/StringUtil.java +++ b/source/mir/misc/StringUtil.java @@ -878,6 +878,63 @@ public final class StringUtil { return null; } } + + /** + * this method deletes all html tags + * + */ + + public static String removeHTMLTags(String haystack){ +try { + RE regex = new RE("<[^>]*>",RE.REG_ICASE); + haystack = regex.substituteAll(haystack,""); + + return haystack; + } catch(REException ex){ + return null; + } + + + } + + /** + * this method deletes all but the approved tags html tags + * it also deletes approved tags which contain malicious-looking attributes and doesn't work at all + */ + + + public static String approveHTMLTags(String haystack){ + try { + String approvedTags="a|img|h1|h2|h3|h4|h5|h6|br|b|i|strong|p"; + String badAttributes="onAbort|onBlur|onChange|onClick|onDblClick|onDragDrop|onError|onFocus|onKeyDown|onKeyPress|onKeyUp|onLoad|onMouseDown|onMouseMove|onMouseOut|onMouseOver|onMouseUp|onMove|onReset|onResize|onSelect|onSubmit|onUnload"; + String approvedProtocols="rtsp|http|ftp|https|freenet|mailto"; + + // kill all the bad tags that have attributes + + RE regex = new RE("<\\s*/?\\s*(?!(("+approvedTags+")\\s))\\w+\\s[^>]*>",RE.REG_ICASE); + haystack = regex.substituteAll(haystack,""); + + // kill all the bad tags that are attributeless + regex = new RE("<\\s*/?\\s*(?!(("+approvedTags+")\\s*>))\\w+\\s*>",RE.REG_ICASE); + 
haystack = regex.substituteAll(haystack,""); + + // kill all the tags which have a javascript attribute like onLoad + regex = new RE("<[^>]*("+badAttributes+")[^>]*>",RE.REG_ICASE); + haystack = regex.substituteAll(haystack,""); + + // kill all the tags which include a url to an unacceptable protocol + regex = new RE("<\\s*a\\s+[^>]*href=(?!(\'|\")?("+approvedProtocols+"))[^>]*>",RE.REG_ICASE); + haystack = regex.substituteAll(haystack,""); + + return haystack; + } catch(REException ex){ + //return ex.toString(); + return null; + } + + + } + /** * createHTML ruft alle regex-methoden zum unwandeln eines nicht