001    /*
002     * 
003     * $Revision: 15222 $ $Date: 2009-05-19 12:25:55 +0200 (Tue, 19 May 2009) $
004     *
005     * This file is part of ***  M y C o R e  ***
006     * See http://www.mycore.de/ for details.
007     *
008     * This program is free software; you can use it, redistribute it
009     * and / or modify it under the terms of the GNU General Public License
010     * (GPL) as published by the Free Software Foundation; either version 2
011     * of the License or (at your option) any later version.
012     *
013     * This program is distributed in the hope that it will be useful, but
014     * WITHOUT ANY WARRANTY; without even the implied warranty of
015     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016     * GNU General Public License for more details.
017     *
018     * You should have received a copy of the GNU General Public License
019     * along with this program, in a file called gpl.txt or license.txt.
020     * If not, write to the Free Software Foundation Inc.,
021     * 59 Temple Place - Suite 330, Boston, MA  02111-1307 USA
022     */
023    
024    package org.mycore.common;
025    
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;

import com.ibm.icu.text.Normalizer;
032    
033    /**
034     * This class implements only static methods to normalize text values. Rules
035     * written as x>u .You can configure this normalization with three property
036     * values<br>
037     * <ul>
038     * <li>MCR.Metadata.Normalize.AddRule - add more rules to the default rule</li>
039     * <li>MCR.Metadata.Normalize.SetRule - replace the default rule</li>
040     * <li>MCR.Metadata.Normalize.DiacriticRule true (standard) | false - first
041     * rule, remove diacritics from letters <br>
042     * Here you can see how decomposition works:
043     * http://www.icu-project.org/apiref/icu4j/com/ibm/icu/text/Normalizer.html <br>
044     * These diacritics will be removed from letters when property is true: <br>
045     * "\u0301", // &amp;#769; (0xcc 0x81 = 204 129) COMBINING ACUTE ACCENT <br>
046     * "\u0300", // &amp;#768; (0xcc 0x80 = 204 128) COMBINING GRAVE ACCENT <br>
047     * "\u0302", // &amp;#770; (0xcc 0x82 = 204 130) COMBINING CIRCUMFLEX ACCENT
048     * <br>
049     * "\u0307", // &amp;#775; (0xcc 0x87 = 204 135) COMBINING DOT ABOVE <br>
050     * "\u0308", // &amp;#776; (0xcc 0x88 = 204 136) COMBINING DIAERESIS <br>
051     * "\u0306", // &amp;#774; (0xcc 0x86 = 204 134) COMBINING BREVE <br>
052     * "\u030B", // &amp;#779; (0xcc 0x8b = 204 139) COMBINING DOUBLE ACUTE ACCENT
053     * <br>
054     * "\u030C", // &amp;#780; (0xcc 0x8c = 204 140) COMBINING CARON (Hacek) <br>
055     * "\u030A", // &amp;#778; (0xcc 0x8a = 204 138) COMBINING RING ABOVE <br>
056     * "\u0304", // &amp;#772; (0xcc 0x84 = 204 132) COMBINING MACRON <br>
057     * "\u032E", // &amp;#814; (0xcc 0xae = 204 174) COMBINING BREVE BELOW <br>
058     * "\u0328", // &amp;#808; (0xcc 0xa8 = 204 168) COMBINING OGONEK <br>
059     * "\u0327", // &amp;#807; (0xcc 0xa7 = 204 167) COMBINING CEDILLA <br>
060     * "\u0323", // &amp;#803; (0xcc 0xa3 = 204 163) COMBINING DOT BELOW <br>
061     * "\u0338", // &amp;#824; (0xcc 0xb8 = 204 184) COMBINING LONG SOLIDUS OVERLAY
062     * <br>
063     * "\u0336", // &amp;#822; (0xcc 0xb6 = 204 182) COMBINING LONG STROKE OVERLAY
064     * <br>
065     * "\u0332", // &amp;#818; (0xcc 0xb2 = 204 178) COMBINING LOW LINE <br>
066     * "\u0303"};// &amp;#771; (0xcc 0x83 = 204 131) COMBINING TILDE <br>
067     * </li>
068     * </ul>
069     * 
070     * @author Frank L\u00fctzenkirchen
071     * @author Thomas Scheffler (yagee)
072     * @author Jens Kupferschmidt
073     * @author Harald Richter
074     * 
075     * @version $Revision: 15222 $ $Date: 2009-05-19 12:25:55 +0200 (Tue, 19 May 2009) $
076     */
077    public class MCRNormalizer {
078        static Logger logger = Logger.getLogger(MCRNormalizer.class);
079    
080        /** List of characters that will be replaced */
081        private static String rules = "\u00DF>ss \u00E4>ae \u00C4>ae \u00F6>oe \u00D6>oe \u00FC>ue \u00DC>ue"; // sz ae Ae oe Oe ue Ue
082    
083        private static Pattern[] patterns;
084    
085        private static String[] replace;
086    
087        private static MCRConfiguration config = MCRConfiguration.instance();
088    
089        private static boolean normalize = config.getBoolean("MCR.Metadata.Normalize", true);
090    
091        private static String addRule = config.getString("MCR.Metadata.Normalize.AddRule", "");
092    
093        private static String setRule = config.getString("MCR.Metadata.Normalize.SetRule", "");
094    
095        private static boolean diacriticRule = config.getBoolean("MCR.Metadata.Normalize.DiacriticRule", true);
096    
097        private static boolean useRuleFirst = config.getBoolean("MCR.Metadata.Normalize.UseRuleFirst", false);
098    
099        static {
100            if ((setRule != null) && (setRule.trim().length() != 0)) {
101                rules = setRule;
102            } else {
103                if ((addRule != null) && (addRule.trim().length() != 0)) {
104                    rules = rules + " " + addRule;
105                }
106    
107            }
108            StringTokenizer st = new StringTokenizer(rules, "> ");
109            int numPatterns = st.countTokens() / 2;
110    
111            patterns = new Pattern[numPatterns];
112            replace = new String[numPatterns];
113    
114            for (int i = 0; i < numPatterns; i++) {
115                patterns[i] = Pattern.compile(st.nextToken());
116                replace[i] = st.nextToken();
117                logger.debug("normalize -->" + patterns[i] + " to -->" + replace[i]);
118            }
119        }
120    
121        /**
122         * This method replaces umlauts and other special characters of languages
123         * like german to normalized lowercase a-z characters.
124         * 
125         * @param in
126         *            the String to be normalized
127         * @return the normalized String in lower case.
128         */
129        public static final String normalizeString(String in) {
130            String temp = in;
131            // use private rules first
132            if (useRuleFirst) {
133                temp = normalizeString(temp, normalize);
134            }
135    
136            // replace letters with diacritics with therr corresponding letter
137            // lower letter umlat a is replaces with a
138            if (diacriticRule) {
139                temp = Normalizer.decompose(temp, false);
140    
141                String[] dia = { "\u0301", // &amp;#769; (0xcc 0x81 = 204 129)
142                        // COMBINING ACUTE ACCENT
143                        "\u0300", // &amp;#768; (0xcc 0x80 = 204 128)
144                        // COMBINING GRAVE ACCENT
145                        "\u0302", // &amp;#770; (0xcc 0x82 = 204 130)
146                        // COMBINING CIRCUMFLEX ACCENT
147                        "\u0307", // &amp;#775; (0xcc 0x87 = 204 135)
148                        // COMBINING DOT ABOVE
149                        "\u0308", // &amp;#776; (0xcc 0x88 = 204 136)
150                        // COMBINING DIAERESIS
151                        "\u0306", // &amp;#774; (0xcc 0x86 = 204 134)
152                        // COMBINING BREVE
153                        "\u030B", // &amp;#779; (0xcc 0x8b = 204 139)
154                        // COMBINING DOUBLE ACUTE ACCENT
155                        "\u030C", // &amp;#780; (0xcc 0x8c = 204 140)
156                        // COMBINING CARON (Hacek)
157                        "\u030A", // &amp;#778; (0xcc 0x8a = 204 138)
158                        // COMBINING RING ABOVE
159                        "\u0304", // &amp;#772; (0xcc 0x84 = 204 132)
160                        // COMBINING MACRON
161                        "\u032E", // &amp;#814; (0xcc 0xae = 204 174)
162                        // COMBINING BREVE BELOW
163                        "\u0328", // &amp;#808; (0xcc 0xa8 = 204 168)
164                        // COMBINING OGONEK
165                        "\u0327", // &amp;#807; (0xcc 0xa7 = 204 167)
166                        // COMBINING CEDILLA
167                        "\u0323", // &amp;#803; (0xcc 0xa3 = 204 163)
168                        // COMBINING DOT BELOW
169                        "\u0338", // &amp;#824; (0xcc 0xb8 = 204 184)
170                        // COMBINING LONG SOLIDUS OVERLAY
171                        "\u0336", // &amp;#822; (0xcc 0xb6 = 204 182)
172                        // COMBINING LONG STROKE OVERLAY
173                        "\u0332", // &amp;#818; (0xcc 0xb2 = 204 178)
174                        // COMBINING LOW LINE
175                        "\u0303" };// &amp;#771; (0xcc 0x83 = 204 131)
176                // COMBINING TILDE
177    
178                for (int i = 0; i < dia.length; i++) {
179                    temp = MCRUtils.replaceString(temp, dia[i], "");
180                }
181            }
182    
183            if (!useRuleFirst) {
184                temp = normalizeString(temp, normalize);
185            }
186    
187            return temp;
188        }
189    
190        public static final String normalizeString(String in, boolean reallyNormalize) {
191            if ((in == null) || (in.trim().length() == 0))
192                return "";
193    
194            if (!reallyNormalize)
195                return in;
196    
197            // in = in.toLowerCase(Locale.GERMANY).trim();
198            in = in.toLowerCase().trim();
199    
200            for (int i = 0; i < patterns.length; i++)
201                in = patterns[i].matcher(in).replaceAll(replace[i]);
202    
203            return in;
204        }
205    
206        /**
207         * Activates or deactivates normalizing. Used in miless software to make
208         * indexing of scorm and searching possible
209         * 
210         * @param value
211         *            true normalize strings false do not normalize strings
212         * 
213         */
214        public static final void setDoNormalize(boolean value) {
215            normalize = value;
216        }
217    }