// Copyright 2004, 2005 The Apache Software Foundation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
014
015 package org.apache.tapestry.util.text;
016
017 /**
018 * An object that encodes a character according to rules of the HTML
019 * specification, so that it will be properly parsed by a browser irrespectively
020 * of the character encoding used in the HTML output.
021 *
022 * @author mb
023 * @since 4.0
024 */
025 public class MarkupCharacterTranslator implements ICharacterTranslator {
026
027 private static final String SAFE_CHARACTERS = "01234567890"
028 + "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
029 + "\t\n\r !#$%'()*+,-./:;=?@[\\]^_`{|}~";
030
031 private static final String[][] ENTITIES = {
032 {"\"", """},
033 {"<", "<"}, {">", ">"}, {"&", "&"}
034 };
035
036 private static final ICharacterMatcher SAFE_MATCHER = new AsciiCharacterMatcher(SAFE_CHARACTERS);
037 private static final ICharacterTranslator ENTITY_TRANSLATOR = new AsciiCharacterTranslator(ENTITIES);
038
039 private boolean _encodeNonAscii;
040 private ICharacterMatcher _safeMatcher;
041 private ICharacterTranslator _entityTranslator;
042
043 public MarkupCharacterTranslator()
044 {
045 this(true);
046 }
047
048 public MarkupCharacterTranslator(boolean encodeNonAscii)
049 {
050 this(encodeNonAscii, SAFE_MATCHER, ENTITY_TRANSLATOR);
051 }
052
053 public MarkupCharacterTranslator(boolean encodeNonAscii,
054 ICharacterMatcher safeMatcher, ICharacterTranslator entityTranslator)
055 {
056 _encodeNonAscii = encodeNonAscii;
057 _safeMatcher = safeMatcher;
058 _entityTranslator = entityTranslator;
059 }
060
061 public MarkupCharacterTranslator(boolean encodeNonAscii,
062 String safeCharacters, String[][] entities)
063 {
064 _encodeNonAscii = encodeNonAscii;
065 _safeMatcher = new AsciiCharacterMatcher(safeCharacters);
066 _entityTranslator = new AsciiCharacterTranslator(entities);
067 }
068
069 /**
070 * @see ICharacterTranslator#translate(char)
071 */
072 public String translate(char ch)
073 {
074 // IE and Firefox do not handle characters between 128 and 159 well,
075 // so they have to be quoted as well
076 if (ch >= 160 && !_encodeNonAscii)
077 return null;
078
079 if (_safeMatcher.matches(ch))
080 return null;
081
082 String entity = _entityTranslator.translate(ch);
083 if (entity != null)
084 return entity;
085
086 // needs to use a NumberFormat here to be fully compliant,
087 // but this is accepted fine by the browsers
088 return "&#" + (int) ch + ";";
089 }
090 }