001    // Copyright 2004, 2005 The Apache Software Foundation
002    //
003    // Licensed under the Apache License, Version 2.0 (the "License");
004    // you may not use this file except in compliance with the License.
005    // You may obtain a copy of the License at
006    //
007    //     http://www.apache.org/licenses/LICENSE-2.0
008    //
009    // Unless required by applicable law or agreed to in writing, software
010    // distributed under the License is distributed on an "AS IS" BASIS,
011    // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
012    // See the License for the specific language governing permissions and
013    // limitations under the License.
014    
015    package org.apache.tapestry.util.text;
016    
017    /**
018     * An object that encodes a character according to rules of the HTML
019     * specification, so that it will be properly parsed by a browser irrespectively
020     * of the character encoding used in the HTML output.
021     *
022     * @author mb
023     * @since 4.0
024     */
025    public class MarkupCharacterTranslator implements ICharacterTranslator {
026    
027        private static final String SAFE_CHARACTERS = "01234567890"
028                                                      + "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
029                                                      + "\t\n\r !#$%'()*+,-./:;=?@[\\]^_`{|}~";
030    
031        private static final String[][] ENTITIES = {
032                {"\"", """},
033                {"<", "&lt;"}, {">", "&gt;"}, {"&", "&amp;"}
034        };
035    
036        private static final ICharacterMatcher SAFE_MATCHER = new AsciiCharacterMatcher(SAFE_CHARACTERS);
037        private static final ICharacterTranslator ENTITY_TRANSLATOR = new AsciiCharacterTranslator(ENTITIES);
038    
039        private boolean _encodeNonAscii;
040        private ICharacterMatcher _safeMatcher;
041        private ICharacterTranslator _entityTranslator;
042    
043        public MarkupCharacterTranslator()
044        {
045            this(true);
046        }
047    
048        public MarkupCharacterTranslator(boolean encodeNonAscii)
049        {
050            this(encodeNonAscii, SAFE_MATCHER, ENTITY_TRANSLATOR);
051        }
052    
053        public MarkupCharacterTranslator(boolean encodeNonAscii,
054                                         ICharacterMatcher safeMatcher, ICharacterTranslator entityTranslator)
055        {
056            _encodeNonAscii = encodeNonAscii;
057            _safeMatcher = safeMatcher;
058            _entityTranslator = entityTranslator;
059        }
060    
061        public MarkupCharacterTranslator(boolean encodeNonAscii,
062                                         String safeCharacters, String[][] entities)
063        {
064            _encodeNonAscii = encodeNonAscii;
065            _safeMatcher = new AsciiCharacterMatcher(safeCharacters);
066            _entityTranslator = new AsciiCharacterTranslator(entities);
067        }
068    
069        /**
070         * @see ICharacterTranslator#translate(char)
071         */
072        public String translate(char ch)
073        {
074            // IE and Firefox do not handle characters between 128 and 159 well,
075            // so they have to be quoted as well
076            if (ch >= 160 && !_encodeNonAscii)
077                return null;
078    
079            if (_safeMatcher.matches(ch))
080                return null;
081    
082            String entity = _entityTranslator.translate(ch);
083            if (entity != null)
084                return entity;
085    
086            // needs to use a NumberFormat here to be fully compliant,
087            // but this is accepted fine by the browsers
088            return "&#" + (int) ch + ";";
089        }
090    }