Coverage Report - net.sourceforge.pebble.util.StringUtils
 
Classes in this File Line Coverage Branch Coverage Complexity
StringUtils
99%
408/410
95%
47/49
4.6
 
 1  
 /*
 2  
  * Copyright (c) 2003-2011, Simon Brown
 3  
  * All rights reserved.
 4  
  *
 5  
  * Redistribution and use in source and binary forms, with or without
 6  
  * modification, are permitted provided that the following conditions are met:
 7  
  *
 8  
  *   - Redistributions of source code must retain the above copyright
 9  
  *     notice, this list of conditions and the following disclaimer.
 10  
  *
 11  
  *   - Redistributions in binary form must reproduce the above copyright
 12  
  *     notice, this list of conditions and the following disclaimer in
 13  
  *     the documentation and/or other materials provided with the
 14  
  *     distribution.
 15  
  *
 16  
  *   - Neither the name of Pebble nor the names of its contributors may
 17  
  *     be used to endorse or promote products derived from this software
 18  
  *     without specific prior written permission.
 19  
  *
 20  
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 21  
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 22  
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 23  
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 24  
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 25  
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 26  
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 27  
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 28  
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 29  
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 30  
  * POSSIBILITY OF SUCH DAMAGE.
 31  
  */
 32  
 package net.sourceforge.pebble.util;
 33  
 
 34  
 import java.util.*;
 35  
 import java.util.regex.Matcher;
 36  
 import java.util.regex.Pattern;
 37  
 
 38  
 /**
 39  
  * A collection of utility methods for manipulating strings.
 40  
  *
 41  
  * @author    Simon Brown
 42  
  */
 43  0
 public final class StringUtils {
 44  
 
 45  4
   private static final Pattern OPENING_B_TAG_PATTERN = Pattern.compile("<b>", Pattern.CASE_INSENSITIVE);
 46  4
   private static final Pattern CLOSING_B_TAG_PATTERN = Pattern.compile("</b>", Pattern.CASE_INSENSITIVE);
 47  4
   private static final Pattern OPENING_STRONG_TAG_PATTERN = Pattern.compile("<strong>", Pattern.CASE_INSENSITIVE);
 48  4
   private static final Pattern CLOSING_STRONG_TAG_PATTERN = Pattern.compile("</strong>", Pattern.CASE_INSENSITIVE);
 49  4
   private static final Pattern OPENING_I_TAG_PATTERN = Pattern.compile("<i>", Pattern.CASE_INSENSITIVE);
 50  4
   private static final Pattern CLOSING_I_TAG_PATTERN = Pattern.compile("</i>", Pattern.CASE_INSENSITIVE);
 51  4
   private static final Pattern OPENING_EM_TAG_PATTERN = Pattern.compile("<em>", Pattern.CASE_INSENSITIVE);
 52  4
   private static final Pattern CLOSING_EM_TAG_PATTERN = Pattern.compile("</em>", Pattern.CASE_INSENSITIVE);
 53  4
   private static final Pattern OPENING_BLOCKQUOTE_TAG_PATTERN = Pattern.compile("<blockquote>", Pattern.CASE_INSENSITIVE);
 54  4
   private static final Pattern CLOSING_BLOCKQUOTE_TAG_PATTERN = Pattern.compile("</blockquote>", Pattern.CASE_INSENSITIVE);
 55  4
   private static final Pattern BR_TAG_PATTERN = Pattern.compile("<br */*>", Pattern.CASE_INSENSITIVE);
 56  4
   private static final Pattern OPENING_P_TAG_PATTERN = Pattern.compile("<p>", Pattern.CASE_INSENSITIVE);
 57  4
   private static final Pattern CLOSING_P_TAG_PATTERN = Pattern.compile("</p>", Pattern.CASE_INSENSITIVE);
 58  4
   private static final Pattern OPENING_PRE_TAG_PATTERN = Pattern.compile("<pre>", Pattern.CASE_INSENSITIVE);
 59  4
   private static final Pattern CLOSING_PRE_TAG_PATTERN = Pattern.compile("</pre>", Pattern.CASE_INSENSITIVE);
 60  4
   private static final Pattern OPENING_UL_TAG_PATTERN = Pattern.compile("<ul>", Pattern.CASE_INSENSITIVE);
 61  4
   private static final Pattern CLOSING_UL_TAG_PATTERN = Pattern.compile("</ul>", Pattern.CASE_INSENSITIVE);
 62  4
   private static final Pattern OPENING_OL_TAG_PATTERN = Pattern.compile("<ol>", Pattern.CASE_INSENSITIVE);
 63  4
   private static final Pattern CLOSING_OL_TAG_PATTERN = Pattern.compile("</ol>", Pattern.CASE_INSENSITIVE);
 64  4
   private static final Pattern OPENING_LI_TAG_PATTERN = Pattern.compile("<li>", Pattern.CASE_INSENSITIVE);
 65  4
   private static final Pattern CLOSING_LI_TAG_PATTERN = Pattern.compile("</li>", Pattern.CASE_INSENSITIVE);
 66  4
   private static final Pattern CLOSING_A_TAG_PATTERN = Pattern.compile("</a>", Pattern.CASE_INSENSITIVE);
 67  4
   private static final Pattern OPENING_A_TAG_PATTERN = Pattern.compile("<a href=.*?>", Pattern.CASE_INSENSITIVE);
 68  4
   private static final Pattern OPENING_SUP_TAG_PATTERN = Pattern.compile("<sup>", Pattern.CASE_INSENSITIVE);
 69  4
   private static final Pattern CLOSING_SUP_TAG_PATTERN = Pattern.compile("</sup>", Pattern.CASE_INSENSITIVE);
 70  4
   private static final Pattern OPENING_SUB_TAG_PATTERN = Pattern.compile("<sub>", Pattern.CASE_INSENSITIVE);
 71  4
   private static final Pattern CLOSING_SUB_TAG_PATTERN = Pattern.compile("</sub>", Pattern.CASE_INSENSITIVE);
 72  
 
 73  
   public static final int MAX_CONTENT_LENGTH = 255;
 74  
   public static final int MAX_WORD_LENGTH = 20;
 75  
   public static final int MAX_NUM_OF_POSTS = 5;
 76  
 
 77  
   
 78  
   //HTML4 248 named entities
 79  4
   private final static Map<String,String> htmlEntities = new HashMap<String,String>();
 80  4
   private final static Collection<String> allowedSchemes = new ArrayList<String>();
 81  
 
 82  
   static {
 83  4
         htmlEntities.put("&nbsp;", "\u00A0");
 84  4
         htmlEntities.put("&iexcl;", "\u00A1");
 85  4
         htmlEntities.put("&cent;", "\u00A2");
 86  4
         htmlEntities.put("&pound;", "\u00A3");
 87  4
         htmlEntities.put("&curren;", "\u00A4");
 88  4
         htmlEntities.put("&yen;", "\u00A5");
 89  4
         htmlEntities.put("&brvbar;", "\u00A6");
 90  4
         htmlEntities.put("&sect;", "\u00A7");
 91  4
         htmlEntities.put("&uml;", "\u00A8");
 92  4
         htmlEntities.put("&copy;", "\u00A9");
 93  4
         htmlEntities.put("&ordf;", "\u00AA");
 94  4
         htmlEntities.put("&laquo;", "\u00AB");
 95  4
         htmlEntities.put("&not;", "\u00AC");
 96  4
         htmlEntities.put("&shy;", "\u00AD");
 97  4
         htmlEntities.put("&reg;", "\u00AE");
 98  4
         htmlEntities.put("&macr;", "\u00AF");
 99  4
         htmlEntities.put("&deg;", "\u00B0");
 100  4
         htmlEntities.put("&plusmn;", "\u00B1");
 101  4
         htmlEntities.put("&sup2;", "\u00B2");
 102  4
         htmlEntities.put("&sup3;", "\u00B3");
 103  4
         htmlEntities.put("&acute;", "\u00B4");
 104  4
         htmlEntities.put("&micro;", "\u00B5");
 105  4
         htmlEntities.put("&para;", "\u00B6");
 106  4
         htmlEntities.put("&middot;", "\u00B7");
 107  4
         htmlEntities.put("&cedil;", "\u00B8");
 108  4
         htmlEntities.put("&sup1;", "\u00B9");
 109  4
         htmlEntities.put("&ordm;", "\u00BA");
 110  4
         htmlEntities.put("&raquo;", "\u00BB");
 111  4
         htmlEntities.put("&frac14;", "\u00BC");
 112  4
         htmlEntities.put("&frac12;", "\u00BD");
 113  4
         htmlEntities.put("&frac34;", "\u00BE");
 114  4
         htmlEntities.put("&iquest;", "\u00BF");
 115  4
         htmlEntities.put("&Agrave;", "\u00C0");
 116  4
         htmlEntities.put("&Aacute;", "\u00C1");
 117  4
         htmlEntities.put("&Acirc;", "\u00C2");
 118  4
         htmlEntities.put("&Atilde;", "\u00C3");
 119  4
         htmlEntities.put("&Auml;", "\u00C4");
 120  4
         htmlEntities.put("&Aring;", "\u00C5");
 121  4
         htmlEntities.put("&AElig;", "\u00C6");
 122  4
         htmlEntities.put("&Ccedil;", "\u00C7");
 123  4
         htmlEntities.put("&Egrave;", "\u00C8");
 124  4
         htmlEntities.put("&Eacute;", "\u00C9");
 125  4
         htmlEntities.put("&Ecirc;", "\u00CA");
 126  4
         htmlEntities.put("&Euml;", "\u00CB");
 127  4
         htmlEntities.put("&Igrave;", "\u00CC");
 128  4
         htmlEntities.put("&Iacute;", "\u00CD");
 129  4
         htmlEntities.put("&Icirc;", "\u00CE");
 130  4
         htmlEntities.put("&Iuml;", "\u00CF");
 131  4
         htmlEntities.put("&ETH;", "\u00D0");
 132  4
         htmlEntities.put("&Ntilde;", "\u00D1");
 133  4
         htmlEntities.put("&Ograve;", "\u00D2");
 134  4
         htmlEntities.put("&Oacute;", "\u00D3");
 135  4
         htmlEntities.put("&Ocirc;", "\u00D4");
 136  4
         htmlEntities.put("&Otilde;", "\u00D5");
 137  4
         htmlEntities.put("&Ouml;", "\u00D6");
 138  4
         htmlEntities.put("&times;", "\u00D7");
 139  4
         htmlEntities.put("&Oslash;", "\u00D8");
 140  4
         htmlEntities.put("&Ugrave;", "\u00D9");
 141  4
         htmlEntities.put("&Uacute;", "\u00DA");
 142  4
         htmlEntities.put("&Ucirc;", "\u00DB");
 143  4
         htmlEntities.put("&Uuml;", "\u00DC");
 144  4
         htmlEntities.put("&Yacute;", "\u00DD");
 145  4
         htmlEntities.put("&THORN;", "\u00DE");
 146  4
         htmlEntities.put("&szlig;", "\u00DF");
 147  4
         htmlEntities.put("&agrave;", "\u00E0");
 148  4
         htmlEntities.put("&aacute;", "\u00E1");
 149  4
         htmlEntities.put("&acirc;", "\u00E2");
 150  4
         htmlEntities.put("&atilde;", "\u00E3");
 151  4
         htmlEntities.put("&auml;", "\u00E4");
 152  4
         htmlEntities.put("&aring;", "\u00E5");
 153  4
         htmlEntities.put("&aelig;", "\u00E6");
 154  4
         htmlEntities.put("&ccedil;", "\u00E7");
 155  4
         htmlEntities.put("&egrave;", "\u00E8");
 156  4
         htmlEntities.put("&eacute;", "\u00E9");
 157  4
         htmlEntities.put("&ecirc;", "\u00EA");
 158  4
         htmlEntities.put("&euml;", "\u00EB");
 159  4
         htmlEntities.put("&igrave;", "\u00EC");
 160  4
         htmlEntities.put("&iacute;", "\u00ED");
 161  4
         htmlEntities.put("&icirc;", "\u00EE");
 162  4
         htmlEntities.put("&iuml;", "\u00EF");
 163  4
         htmlEntities.put("&eth;", "\u00F0");
 164  4
         htmlEntities.put("&ntilde;", "\u00F1");
 165  4
         htmlEntities.put("&ograve;", "\u00F2");
 166  4
         htmlEntities.put("&oacute;", "\u00F3");
 167  4
         htmlEntities.put("&ocirc;", "\u00F4");
 168  4
         htmlEntities.put("&otilde;", "\u00F5");
 169  4
         htmlEntities.put("&ouml;", "\u00F6");
 170  4
         htmlEntities.put("&divide;", "\u00F7");
 171  4
         htmlEntities.put("&oslash;", "\u00F8");
 172  4
         htmlEntities.put("&ugrave;", "\u00F9");
 173  4
         htmlEntities.put("&uacute;", "\u00FA");
 174  4
         htmlEntities.put("&ucirc;", "\u00FB");
 175  4
         htmlEntities.put("&uuml;", "\u00FC");
 176  4
         htmlEntities.put("&yacute;", "\u00FD");
 177  4
         htmlEntities.put("&thorn;", "\u00FE");
 178  4
         htmlEntities.put("&yuml;", "\u00FF");
 179  4
         htmlEntities.put("&OElig;", "\u0152");
 180  4
         htmlEntities.put("&oelig;", "\u0153");
 181  4
         htmlEntities.put("&Scaron;", "\u0160");
 182  4
         htmlEntities.put("&scaron;", "\u0161");
 183  4
         htmlEntities.put("&Yuml;", "\u0178");
 184  4
         htmlEntities.put("&fnof;", "\u0192");
 185  4
         htmlEntities.put("&circ;", "\u02C6");
 186  4
         htmlEntities.put("&tilde;", "\u02DC");
 187  4
         htmlEntities.put("&Alpha;", "\u0391");
 188  4
         htmlEntities.put("&Beta;", "\u0392");
 189  4
         htmlEntities.put("&Gamma;", "\u0393");
 190  4
         htmlEntities.put("&Delta;", "\u0394");
 191  4
         htmlEntities.put("&Epsilon;", "\u0395");
 192  4
         htmlEntities.put("&Zeta;", "\u0396");
 193  4
         htmlEntities.put("&Eta;", "\u0397");
 194  4
         htmlEntities.put("&Theta;", "\u0398");
 195  4
         htmlEntities.put("&Iota;", "\u0399");
 196  4
         htmlEntities.put("&Kappa;", "\u039A");
 197  4
         htmlEntities.put("&Lambda;", "\u039B");
 198  4
         htmlEntities.put("&Mu;", "\u039C");
 199  4
         htmlEntities.put("&Nu;", "\u039D");
 200  4
         htmlEntities.put("&Xi;", "\u039E");
 201  4
         htmlEntities.put("&Omicron;", "\u039F");
 202  4
         htmlEntities.put("&Pi;", "\u03A0");
 203  4
         htmlEntities.put("&Rho;", "\u03A1");
 204  4
         htmlEntities.put("&Sigma;", "\u03A3");
 205  4
         htmlEntities.put("&Tau;", "\u03A4");
 206  4
         htmlEntities.put("&Upsilon;", "\u03A5");
 207  4
         htmlEntities.put("&Phi;", "\u03A6");
 208  4
         htmlEntities.put("&Chi;", "\u03A7");
 209  4
         htmlEntities.put("&Psi;", "\u03A8");
 210  4
         htmlEntities.put("&Omega;", "\u03A9");
 211  4
         htmlEntities.put("&alpha;", "\u03B1");
 212  4
         htmlEntities.put("&beta;", "\u03B2");
 213  4
         htmlEntities.put("&gamma;", "\u03B3");
 214  4
         htmlEntities.put("&delta;", "\u03B4");
 215  4
         htmlEntities.put("&epsilon;", "\u03B5");
 216  4
         htmlEntities.put("&zeta;", "\u03B6");
 217  4
         htmlEntities.put("&eta;", "\u03B7");
 218  4
         htmlEntities.put("&theta;", "\u03B8");
 219  4
         htmlEntities.put("&iota;", "\u03B9");
 220  4
         htmlEntities.put("&kappa;", "\u03BA");
 221  4
         htmlEntities.put("&lambda;", "\u03BB");
 222  4
         htmlEntities.put("&mu;", "\u03BC");
 223  4
         htmlEntities.put("&nu;", "\u03BD");
 224  4
         htmlEntities.put("&xi;", "\u03BE");
 225  4
         htmlEntities.put("&omicron;", "\u03BF");
 226  4
         htmlEntities.put("&pi;", "\u03C0");
 227  4
         htmlEntities.put("&rho;", "\u03C1");
 228  4
         htmlEntities.put("&sigmaf;", "\u03C2");
 229  4
         htmlEntities.put("&sigma;", "\u03C3");
 230  4
         htmlEntities.put("&tau;", "\u03C4");
 231  4
         htmlEntities.put("&upsilon;", "\u03C5");
 232  4
         htmlEntities.put("&phi;", "\u03C6");
 233  4
         htmlEntities.put("&chi;", "\u03C7");
 234  4
         htmlEntities.put("&psi;", "\u03C8");
 235  4
         htmlEntities.put("&omega;", "\u03C9");
 236  4
         htmlEntities.put("&thetasym;", "\u03D1");
 237  4
         htmlEntities.put("&upsih;", "\u03D2");
 238  4
         htmlEntities.put("&piv;", "\u03D6");
 239  4
         htmlEntities.put("&ensp;", "\u2002");
 240  4
         htmlEntities.put("&emsp;", "\u2003");
 241  4
         htmlEntities.put("&thinsp;", "\u2009");
 242  4
         htmlEntities.put("&zwnj;", "\u200C");
 243  4
         htmlEntities.put("&zwj;", "\u200D");
 244  4
         htmlEntities.put("&lrm;", "\u200E");
 245  4
         htmlEntities.put("&rlm;", "\u200F");
 246  4
         htmlEntities.put("&ndash;", "\u2013");
 247  4
         htmlEntities.put("&mdash;", "\u2014");
 248  4
         htmlEntities.put("&lsquo;", "\u2018");
 249  4
         htmlEntities.put("&rsquo;", "\u2019");
 250  4
         htmlEntities.put("&sbquo;", "\u201A");
 251  4
         htmlEntities.put("&ldquo;", "\u201C");
 252  4
         htmlEntities.put("&rdquo;", "\u201D");
 253  4
         htmlEntities.put("&bdquo;", "\u201E");
 254  4
         htmlEntities.put("&dagger;", "\u2020");
 255  4
         htmlEntities.put("&Dagger;", "\u2021");
 256  4
         htmlEntities.put("&bull;", "\u2022");
 257  4
         htmlEntities.put("&hellip;", "\u2026");
 258  4
         htmlEntities.put("&permil;", "\u2030");
 259  4
         htmlEntities.put("&prime;", "\u2032");
 260  4
         htmlEntities.put("&Prime;", "\u2033");
 261  4
         htmlEntities.put("&lsaquo;", "\u2039");
 262  4
         htmlEntities.put("&rsaquo;", "\u203A");
 263  4
         htmlEntities.put("&oline;", "\u203E");
 264  4
         htmlEntities.put("&frasl;", "\u2044");
 265  4
         htmlEntities.put("&euro;", "\u20AC");
 266  4
         htmlEntities.put("&image;", "\u2111");
 267  4
         htmlEntities.put("&weierp;", "\u2118");
 268  4
         htmlEntities.put("&real;", "\u211C");
 269  4
         htmlEntities.put("&trade;", "\u2122");
 270  4
         htmlEntities.put("&alefsym;", "\u2135");
 271  4
         htmlEntities.put("&larr;", "\u2190");
 272  4
         htmlEntities.put("&uarr;", "\u2191");
 273  4
         htmlEntities.put("&rarr;", "\u2192");
 274  4
         htmlEntities.put("&darr;", "\u2193");
 275  4
         htmlEntities.put("&harr;", "\u2194");
 276  4
         htmlEntities.put("&crarr;", "\u21B5");
 277  4
         htmlEntities.put("&lArr;", "\u21D0");
 278  4
         htmlEntities.put("&uArr;", "\u21D1");
 279  4
         htmlEntities.put("&rArr;", "\u21D2");
 280  4
         htmlEntities.put("&dArr;", "\u21D3");
 281  4
         htmlEntities.put("&hArr;", "\u21D4");
 282  4
         htmlEntities.put("&forall;", "\u2200");
 283  4
         htmlEntities.put("&part;", "\u2202");
 284  4
         htmlEntities.put("&exist;", "\u2203");
 285  4
         htmlEntities.put("&empty;", "\u2205");
 286  4
         htmlEntities.put("&nabla;", "\u2207");
 287  4
         htmlEntities.put("&isin;", "\u2208");
 288  4
         htmlEntities.put("&notin;", "\u2209");
 289  4
         htmlEntities.put("&ni;", "\u220B");
 290  4
         htmlEntities.put("&prod;", "\u220F");
 291  4
         htmlEntities.put("&sum;", "\u2211");
 292  4
         htmlEntities.put("&minus;", "\u2212");
 293  4
         htmlEntities.put("&lowast;", "\u2217");
 294  4
         htmlEntities.put("&radic;", "\u221A");
 295  4
         htmlEntities.put("&prop;", "\u221D");
 296  4
         htmlEntities.put("&infin;", "\u221E");
 297  4
         htmlEntities.put("&ang;", "\u2220");
 298  4
         htmlEntities.put("&and;", "\u2227");
 299  4
         htmlEntities.put("&or;", "\u2228");
 300  4
         htmlEntities.put("&cap;", "\u2229");
 301  4
         htmlEntities.put("&cup;", "\u222A");
 302  4
         htmlEntities.put("&int;", "\u222B");
 303  4
         htmlEntities.put("&there4;", "\u2234");
 304  4
         htmlEntities.put("&sim;", "\u223C");
 305  4
         htmlEntities.put("&cong;", "\u2245");
 306  4
         htmlEntities.put("&asymp;", "\u2248");
 307  4
         htmlEntities.put("&ne;", "\u2260");
 308  4
         htmlEntities.put("&equiv;", "\u2261");
 309  4
         htmlEntities.put("&le;", "\u2264");
 310  4
         htmlEntities.put("&ge;", "\u2265");
 311  4
         htmlEntities.put("&sub;", "\u2282");
 312  4
         htmlEntities.put("&sup;", "\u2283");
 313  4
         htmlEntities.put("&nsub;", "\u2284");
 314  4
         htmlEntities.put("&sube;", "\u2286");
 315  4
         htmlEntities.put("&supe;", "\u2287");
 316  4
         htmlEntities.put("&oplus;", "\u2295");
 317  4
         htmlEntities.put("&otimes;", "\u2297");
 318  4
         htmlEntities.put("&perp;", "\u22A5");
 319  4
         htmlEntities.put("&sdot;", "\u22C5");
 320  4
         htmlEntities.put("&lceil;", "\u2308");
 321  4
         htmlEntities.put("&rceil;", "\u2309");
 322  4
         htmlEntities.put("&lfloor;", "\u230A");
 323  4
         htmlEntities.put("&rfloor;", "\u230B");
 324  4
         htmlEntities.put("&lang;", "\u2329");
 325  4
         htmlEntities.put("&rang;", "\u232A");
 326  4
         htmlEntities.put("&loz;", "\u25CA");
 327  4
         htmlEntities.put("&spades;", "\u2660");
 328  4
         htmlEntities.put("&clubs;", "\u2663");
 329  4
         htmlEntities.put("&hearts;", "\u2665");
 330  4
         htmlEntities.put("&diams;", "\u2666");
 331  
 
 332  4
   allowedSchemes.add("https://");
 333  4
   allowedSchemes.add("http://");
 334  4
   allowedSchemes.add("ftp://");
 335  4
   allowedSchemes.add("mailto:");
 336  4
   }
 337  
         
 338  
 
 339  
   /**
 340  
    * Filters out characters that have meaning within JSP and HTML, and
 341  
    * replaces them with "escaped" versions.
 342  
    *
 343  
    * @param s   the String to filter
 344  
    * @return  the filtered String
 345  
    */
 346  
   public static String transformHTML(String s) {
 347  
 
 348  5934
     if (s == null) {
 349  1524
       return null;
 350  
     }
 351  
 
 352  4410
     StringBuffer buf = new StringBuffer(s.length());
 353  
 
 354  
     // loop through every character and replace if necessary
 355  4410
     int length = s.length();
 356  30744
     for (int i = 0; i < length; i++) {
 357  26334
       switch (s.charAt(i)) {
 358  
         case '<':
 359  112
           buf.append("&lt;");
 360  112
           break;
 361  
         case '>':
 362  112
           buf.append("&gt;");
 363  112
           break;
 364  
         case '&':
 365  4
           buf.append("&amp;");
 366  4
           break;
 367  
         case '\"':
 368  8
             buf.append("&quot;");
 369  8
             break;
 370  
         default :
 371  26098
           buf.append(s.charAt(i));
 372  
       }
 373  
     }
 374  
 
 375  4410
     return buf.toString();
 376  
   }
 377  
 
 378  
   /**
 379  
    * Transforms the given String into a subset of HTML displayable on a web
 380  
    * page. The subset includes &lt;b&gt;, &lt;i&gt;, &lt;p&gt;, &lt;br&gt;,
 381  
    * &lt;pre&gt; and &lt;a href&gt; (and their corresponding end tags).
 382  
    *
 383  
    * @param s   the String to transform
 384  
    * @return    the transformed String
 385  
    */
 386  
   public static String transformToHTMLSubset(String s) {
 387  
 
 388  172
     if (s == null) {
 389  12
       return null;
 390  
     }
 391  
 
 392  160
     s = replace(s, OPENING_B_TAG_PATTERN, "<b>");
 393  160
     s = replace(s, CLOSING_B_TAG_PATTERN, "</b>");
 394  160
     s = replace(s, OPENING_STRONG_TAG_PATTERN, "<strong>");
 395  160
     s = replace(s, CLOSING_STRONG_TAG_PATTERN, "</strong>");
 396  160
     s = replace(s, OPENING_I_TAG_PATTERN, "<i>");
 397  160
     s = replace(s, CLOSING_I_TAG_PATTERN, "</i>");
 398  160
     s = replace(s, OPENING_EM_TAG_PATTERN, "<em>");
 399  160
     s = replace(s, CLOSING_EM_TAG_PATTERN, "</em>");
 400  160
     s = replace(s, OPENING_BLOCKQUOTE_TAG_PATTERN, "<blockquote>");
 401  160
     s = replace(s, CLOSING_BLOCKQUOTE_TAG_PATTERN, "</blockquote>");
 402  160
     s = replace(s, BR_TAG_PATTERN, "<br />");
 403  160
     s = replace(s, OPENING_P_TAG_PATTERN, "<p>");
 404  160
     s = replace(s, CLOSING_P_TAG_PATTERN, "</p>");
 405  160
     s = replace(s, OPENING_PRE_TAG_PATTERN, "<pre>");
 406  160
     s = replace(s, CLOSING_PRE_TAG_PATTERN, "</pre>");
 407  160
     s = replace(s, OPENING_UL_TAG_PATTERN, "<ul>");
 408  160
     s = replace(s, CLOSING_UL_TAG_PATTERN, "</ul>");
 409  160
     s = replace(s, OPENING_OL_TAG_PATTERN, "<ol>");
 410  160
     s = replace(s, CLOSING_OL_TAG_PATTERN, "</ol>");
 411  160
     s = replace(s, OPENING_LI_TAG_PATTERN, "<li>");
 412  160
     s = replace(s, CLOSING_LI_TAG_PATTERN, "</li>");
 413  160
     s = replace(s, OPENING_SUP_TAG_PATTERN, "<sup>");
 414  160
     s = replace(s, CLOSING_SUP_TAG_PATTERN, "</sup>");
 415  160
     s = replace(s, OPENING_SUB_TAG_PATTERN, "<sub>");
 416  160
     s = replace(s, CLOSING_SUB_TAG_PATTERN, "</sub>");
 417  
 
 418  
     // HTTP links - remove all attributes other than href
 419  160
     s = replace(s, CLOSING_A_TAG_PATTERN, "</a>");
 420  160
     Matcher m = OPENING_A_TAG_PATTERN.matcher(s);
 421  
     // Use a single buffer for efficiency
 422  160
     StringBuffer buffer = new StringBuffer();
 423  
     // The position in the original string that we are up to
 424  160
     int position = 0;
 425  204
     while (m.find()) {
 426  44
       int start = m.start();
 427  44
       int end = m.end();
 428  44
       buffer.append(s.subSequence(position, start)).append("<a href=");
 429  44
       String link = s.substring(start, end);
 430  44
       int startOfHrefIndex = link.indexOf("href=&quot;");
 431  44
       if (startOfHrefIndex > -1) {
 432  32
         int startOfHrefValue = startOfHrefIndex + "href=&quot;".length();
 433  32
         int endOfHrefIndex = link.indexOf("&quot;", startOfHrefValue);
 434  32
         buffer.append("\"").append(validateUrl(link.substring(startOfHrefValue, endOfHrefIndex))).append("\"");
 435  32
       } else {
 436  12
         startOfHrefIndex = link.indexOf("href='");
 437  12
         if (startOfHrefIndex > -1) {
 438  12
           int startOfHrefValue = startOfHrefIndex + "href='".length();
 439  12
           int endOfHrefIndex = link.indexOf("'", startOfHrefIndex+"href='".length());
 440  12
           buffer.append("'").append(validateUrl(link.substring(startOfHrefValue, endOfHrefIndex))).append("'");
 441  
         }
 442  
       }
 443  44
       buffer.append(">");
 444  44
       position = end;
 445  44
     }
 446  
     // If position is still 0 there were no matches, so don't do anything
 447  160
     if (position > 0) {
 448  36
       buffer.append(s.subSequence(position, s.length()));
 449  36
       s = buffer.toString();
 450  
     }
 451  
 
 452  
     // escaped angle brackets and other allowed entities
 453  160
     s = s.replaceAll("&amp;lt;", "&lt;");
 454  160
     s = s.replaceAll("&amp;gt;", "&gt;");
 455  160
     s = s.replaceAll("&amp;([#a-zA-Z0-9]{1,}?);", "&$1;");
 456  
     
 457  160
     return s;
 458  
   }
 459  
 
 460  
   private static String replace(String string, Pattern pattern, String replacement) {
 461  4160
     Matcher m = pattern.matcher(string);
 462  4160
     return m.replaceAll(replacement);
 463  
   }
 464  
 
 465  
   /**
 466  
    * Filters out newline characters.
 467  
    *
 468  
    * @param s   the String to filter
 469  
    * @return  the filtered String
 470  
    */
 471  
   public static String filterNewlines(String s) {
 472  
 
 473  16
     if (s == null) {
 474  4
       return null;
 475  
     }
 476  
 
 477  12
     StringBuffer buf = new StringBuffer(s.length());
 478  
 
 479  
     // loop through every character and replace if necessary
 480  12
     int length = s.length();
 481  164
     for (int i = 0; i < length; i++) {
 482  152
       switch (s.charAt(i)) {
 483  
         case '\r':
 484  8
           break;
 485  
         default :
 486  144
           buf.append(s.charAt(i));
 487  
       }
 488  
     }
 489  
 
 490  12
     return buf.toString();
 491  
   }
 492  
 
 493  
   /**
 494  
    * Filters out all HTML tags.
 495  
    *
 496  
    * @param s   the String to filter
 497  
    * @return    the filtered String
 498  
    */
 499  
   public static String filterHTML(String s) {
 500  2256
     if (s == null) {
 501  132
       return null;
 502  
     }
 503  
 
 504  2124
     s = s.replaceAll("&lt;", "");
 505  2124
     s = s.replaceAll("&gt;", "");
 506  2124
     s = s.replaceAll("&nbsp;", "");
 507  2124
     s = s.replaceAll("(?s)<!--.*?-->", "");
 508  2124
     return s.replaceAll("(?s)<.*?>", "");
 509  
   }
 510  
 
 511  
   public static String truncate(String s) {
 512  168
     return truncate(s, MAX_CONTENT_LENGTH);
 513  
   }
 514  
 
 515  
   public static String truncate(String s, int maxLength) {
 516  168
     String content = StringUtils.filterHTML(s);
 517  
 
 518  
     // then truncate, if necessary
 519  168
     if (content == null) {
 520  4
       return "";
 521  
     } else {
 522  164
       StringBuffer buf = new StringBuffer();
 523  
 
 524  164
       String words[] = content.split("\\s");
 525  508
       for (int i = 0; i < words.length; i++) {
 526  360
         if (buf.length() + words[i].length() > maxLength) {
 527  
           // truncate here
 528  4
           buf.append("...");
 529  4
           return buf.toString();
 530  356
         } else if (words[i].length() > MAX_WORD_LENGTH) {
 531  
           // truncate here
 532  12
           buf.append(words[i].substring(0, MAX_WORD_LENGTH));
 533  12
           buf.append("...");
 534  12
           return buf.toString();
 535  
         } else {
 536  344
           buf.append(words[i]);
 537  344
           if ((i+1) < words.length) {
 538  196
             buf.append(" ");
 539  
           }
 540  
         }
 541  
       }
 542  
 
 543  148
       return buf.toString();
 544  
     }
 545  
   }
 546  
 
 547  
   public static String stripScriptTags(String html) {
 548  20
     if (html == null) {
 549  0
       return html;
 550  
     }
 551  
 
 552  20
     html = html.replaceAll("<script.*?>.*?</script.*?>", "");
 553  20
     html = html.replaceAll("<script.*?/>", "");
 554  20
     return html;
 555  
   }
 556  
 
 557  
 
 558  
   public static String unescapeHTMLEntities(String source) {
 559  24
      Iterator<String> it = htmlEntities.keySet().iterator(); 
 560  
          
 561  5976
          while(it.hasNext()) { 
 562  
                  
 563  5952
                  String key = it.next(); 
 564  5952
                  String val = htmlEntities.get(key); 
 565  5952
                  source = source.replaceAll(key, val);
 566  5952
          } 
 567  24
      return source;
 568  
   }
 569  
 
 570  
   public static String validateUrl(String url) {
 571  1948
     if (url == null || url.length() == 0) {
 572  232
       return null;
 573  
     }
 574  
     // whitelist, don't blacklist.
 575  1716
     for (String scheme : allowedSchemes) {
 576  4088
       if (url.startsWith(scheme)) {
 577  1400
         return url;
 578  
       }
 579  
     }
 580  316
     return "http://" + url;
 581  
   }
 582  
 
 583  
 }