001 // Copyright 2004, 2005 The Apache Software Foundation 002 // 003 // Licensed under the Apache License, Version 2.0 (the "License"); 004 // you may not use this file except in compliance with the License. 005 // You may obtain a copy of the License at 006 // 007 // http://www.apache.org/licenses/LICENSE-2.0 008 // 009 // Unless required by applicable law or agreed to in writing, software 010 // distributed under the License is distributed on an "AS IS" BASIS, 011 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 012 // See the License for the specific language governing permissions and 013 // limitations under the License. 014 015 package org.apache.tapestry.util.text; 016 017 /** 018 * An object that encodes a character according to rules of the HTML 019 * specification, so that it will be properly parsed by a browser irrespectively 020 * of the character encoding used in the HTML output. 021 * 022 * @author mb 023 * @since 4.0 024 */ 025 public class MarkupCharacterTranslator implements ICharacterTranslator { 026 027 private static final String SAFE_CHARACTERS = "01234567890" 028 + "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" 029 + "\t\n\r !#$%'()*+,-./:;=?@[\\]^_`{|}~"; 030 031 private static final String[][] ENTITIES = { 032 {"\"", """}, 033 {"<", "<"}, {">", ">"}, {"&", "&"} 034 }; 035 036 private static final ICharacterMatcher SAFE_MATCHER = new AsciiCharacterMatcher(SAFE_CHARACTERS); 037 private static final ICharacterTranslator ENTITY_TRANSLATOR = new AsciiCharacterTranslator(ENTITIES); 038 039 private boolean _encodeNonAscii; 040 private ICharacterMatcher _safeMatcher; 041 private ICharacterTranslator _entityTranslator; 042 043 public MarkupCharacterTranslator() 044 { 045 this(true); 046 } 047 048 public MarkupCharacterTranslator(boolean encodeNonAscii) 049 { 050 this(encodeNonAscii, SAFE_MATCHER, ENTITY_TRANSLATOR); 051 } 052 053 public MarkupCharacterTranslator(boolean encodeNonAscii, 054 ICharacterMatcher safeMatcher, ICharacterTranslator entityTranslator) 055 { 056 _encodeNonAscii = encodeNonAscii; 057 _safeMatcher = safeMatcher; 058 _entityTranslator = entityTranslator; 059 } 060 061 public MarkupCharacterTranslator(boolean encodeNonAscii, 062 String safeCharacters, String[][] entities) 063 { 064 _encodeNonAscii = encodeNonAscii; 065 _safeMatcher = new AsciiCharacterMatcher(safeCharacters); 066 _entityTranslator = new AsciiCharacterTranslator(entities); 067 } 068 069 /** 070 * @see ICharacterTranslator#translate(char) 071 */ 072 public String translate(char ch) 073 { 074 // IE and Firefox do not handle characters between 128 and 159 well, 075 // so they have to be quoted as well 076 if (ch >= 160 && !_encodeNonAscii) 077 return null; 078 079 if (_safeMatcher.matches(ch)) 080 return null; 081 082 String entity = _entityTranslator.translate(ch); 083 if (entity != null) 084 return entity; 085 086 // needs to use a NumberFormat here to be fully compliant, 087 // but this is accepted fine by the browsers 088 return "&#" + (int) ch + ";"; 089 } 090 }