001 /*
002 *
003 * $Revision: 15222 $ $Date: 2009-05-19 12:25:55 +0200 (Tue, 19 May 2009) $
004 *
005 * This file is part of *** M y C o R e ***
006 * See http://www.mycore.de/ for details.
007 *
008 * This program is free software; you can use it, redistribute it
009 * and / or modify it under the terms of the GNU General Public License
010 * (GPL) as published by the Free Software Foundation; either version 2
011 * of the License or (at your option) any later version.
012 *
013 * This program is distributed in the hope that it will be useful, but
014 * WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
016 * GNU General Public License for more details.
017 *
018 * You should have received a copy of the GNU General Public License
019 * along with this program, in a file called gpl.txt or license.txt.
020 * If not, write to the Free Software Foundation Inc.,
021 * 59 Temple Place - Suite 330, Boston, MA 02111-1307 USA
022 */
023
024 package org.mycore.common;
025
026 import java.util.StringTokenizer;
027 import java.util.regex.Pattern;
028
029 import org.apache.log4j.Logger;
030
031 import com.ibm.icu.text.Normalizer;
032
033 /**
034 * This class implements only static methods to normalize text values. Rules
035 * written as x>u .You can configure this normalization with three property
036 * values<br>
037 * <ul>
038 * <li>MCR.Metadata.Normalize.AddRule - add more rules to the default rule</li>
039 * <li>MCR.Metadata.Normalize.SetRule - replace the default rule</li>
040 * <li>MCR.Metadata.Normalize.DiacriticRule true (standard) | false - first
041 * rule, remove diacritics from letters <br>
042 * Here you can see how decomposition works:
043 * http://www.icu-project.org/apiref/icu4j/com/ibm/icu/text/Normalizer.html <br>
044 * These diacritics will be removed from letters when property is true: <br>
045 * "\u0301", // &#769; (0xcc 0x81 = 204 129) COMBINING ACUTE ACCENT <br>
046 * "\u0300", // &#768; (0xcc 0x80 = 204 128) COMBINING GRAVE ACCENT <br>
047 * "\u0302", // &#770; (0xcc 0x82 = 204 130) COMBINING CIRCUMFLEX ACCENT
048 * <br>
049 * "\u0307", // &#775; (0xcc 0x87 = 204 135) COMBINING DOT ABOVE <br>
050 * "\u0308", // &#776; (0xcc 0x88 = 204 136) COMBINING DIAERESIS <br>
051 * "\u0306", // &#774; (0xcc 0x86 = 204 134) COMBINING BREVE <br>
052 * "\u030B", // &#779; (0xcc 0x8b = 204 139) COMBINING DOUBLE ACUTE ACCENT
053 * <br>
054 * "\u030C", // &#780; (0xcc 0x8c = 204 140) COMBINING CARON (Hacek) <br>
055 * "\u030A", // &#778; (0xcc 0x8a = 204 138) COMBINING RING ABOVE <br>
056 * "\u0304", // &#772; (0xcc 0x84 = 204 132) COMBINING MACRON <br>
057 * "\u032E", // &#814; (0xcc 0xae = 204 174) COMBINING BREVE BELOW <br>
058 * "\u0328", // &#808; (0xcc 0xa8 = 204 168) COMBINING OGONEK <br>
059 * "\u0327", // &#807; (0xcc 0xa7 = 204 167) COMBINING CEDILLA <br>
060 * "\u0323", // &#803; (0xcc 0xa3 = 204 163) COMBINING DOT BELOW <br>
061 * "\u0338", // &#824; (0xcc 0xb8 = 204 184) COMBINING LONG SOLIDUS OVERLAY
062 * <br>
063 * "\u0336", // &#822; (0xcc 0xb6 = 204 182) COMBINING LONG STROKE OVERLAY
064 * <br>
065 * "\u0332", // &#818; (0xcc 0xb2 = 204 178) COMBINING LOW LINE <br>
066 * "\u0303"};// &#771; (0xcc 0x83 = 204 131) COMBINING TILDE <br>
067 * </li>
068 * </ul>
069 *
070 * @author Frank L\u00fctzenkirchen
071 * @author Thomas Scheffler (yagee)
072 * @author Jens Kupferschmidt
073 * @author Harald Richter
074 *
075 * @version $Revision: 15222 $ $Date: 2009-05-19 12:25:55 +0200 (Tue, 19 May 2009) $
076 */
077 public class MCRNormalizer {
078 static Logger logger = Logger.getLogger(MCRNormalizer.class);
079
080 /** List of characters that will be replaced */
081 private static String rules = "\u00DF>ss \u00E4>ae \u00C4>ae \u00F6>oe \u00D6>oe \u00FC>ue \u00DC>ue"; // sz ae Ae oe Oe ue Ue
082
083 private static Pattern[] patterns;
084
085 private static String[] replace;
086
087 private static MCRConfiguration config = MCRConfiguration.instance();
088
089 private static boolean normalize = config.getBoolean("MCR.Metadata.Normalize", true);
090
091 private static String addRule = config.getString("MCR.Metadata.Normalize.AddRule", "");
092
093 private static String setRule = config.getString("MCR.Metadata.Normalize.SetRule", "");
094
095 private static boolean diacriticRule = config.getBoolean("MCR.Metadata.Normalize.DiacriticRule", true);
096
097 private static boolean useRuleFirst = config.getBoolean("MCR.Metadata.Normalize.UseRuleFirst", false);
098
099 static {
100 if ((setRule != null) && (setRule.trim().length() != 0)) {
101 rules = setRule;
102 } else {
103 if ((addRule != null) && (addRule.trim().length() != 0)) {
104 rules = rules + " " + addRule;
105 }
106
107 }
108 StringTokenizer st = new StringTokenizer(rules, "> ");
109 int numPatterns = st.countTokens() / 2;
110
111 patterns = new Pattern[numPatterns];
112 replace = new String[numPatterns];
113
114 for (int i = 0; i < numPatterns; i++) {
115 patterns[i] = Pattern.compile(st.nextToken());
116 replace[i] = st.nextToken();
117 logger.debug("normalize -->" + patterns[i] + " to -->" + replace[i]);
118 }
119 }
120
121 /**
122 * This method replaces umlauts and other special characters of languages
123 * like german to normalized lowercase a-z characters.
124 *
125 * @param in
126 * the String to be normalized
127 * @return the normalized String in lower case.
128 */
129 public static final String normalizeString(String in) {
130 String temp = in;
131 // use private rules first
132 if (useRuleFirst) {
133 temp = normalizeString(temp, normalize);
134 }
135
136 // replace letters with diacritics with therr corresponding letter
137 // lower letter umlat a is replaces with a
138 if (diacriticRule) {
139 temp = Normalizer.decompose(temp, false);
140
141 String[] dia = { "\u0301", // &#769; (0xcc 0x81 = 204 129)
142 // COMBINING ACUTE ACCENT
143 "\u0300", // &#768; (0xcc 0x80 = 204 128)
144 // COMBINING GRAVE ACCENT
145 "\u0302", // &#770; (0xcc 0x82 = 204 130)
146 // COMBINING CIRCUMFLEX ACCENT
147 "\u0307", // &#775; (0xcc 0x87 = 204 135)
148 // COMBINING DOT ABOVE
149 "\u0308", // &#776; (0xcc 0x88 = 204 136)
150 // COMBINING DIAERESIS
151 "\u0306", // &#774; (0xcc 0x86 = 204 134)
152 // COMBINING BREVE
153 "\u030B", // &#779; (0xcc 0x8b = 204 139)
154 // COMBINING DOUBLE ACUTE ACCENT
155 "\u030C", // &#780; (0xcc 0x8c = 204 140)
156 // COMBINING CARON (Hacek)
157 "\u030A", // &#778; (0xcc 0x8a = 204 138)
158 // COMBINING RING ABOVE
159 "\u0304", // &#772; (0xcc 0x84 = 204 132)
160 // COMBINING MACRON
161 "\u032E", // &#814; (0xcc 0xae = 204 174)
162 // COMBINING BREVE BELOW
163 "\u0328", // &#808; (0xcc 0xa8 = 204 168)
164 // COMBINING OGONEK
165 "\u0327", // &#807; (0xcc 0xa7 = 204 167)
166 // COMBINING CEDILLA
167 "\u0323", // &#803; (0xcc 0xa3 = 204 163)
168 // COMBINING DOT BELOW
169 "\u0338", // &#824; (0xcc 0xb8 = 204 184)
170 // COMBINING LONG SOLIDUS OVERLAY
171 "\u0336", // &#822; (0xcc 0xb6 = 204 182)
172 // COMBINING LONG STROKE OVERLAY
173 "\u0332", // &#818; (0xcc 0xb2 = 204 178)
174 // COMBINING LOW LINE
175 "\u0303" };// &#771; (0xcc 0x83 = 204 131)
176 // COMBINING TILDE
177
178 for (int i = 0; i < dia.length; i++) {
179 temp = MCRUtils.replaceString(temp, dia[i], "");
180 }
181 }
182
183 if (!useRuleFirst) {
184 temp = normalizeString(temp, normalize);
185 }
186
187 return temp;
188 }
189
190 public static final String normalizeString(String in, boolean reallyNormalize) {
191 if ((in == null) || (in.trim().length() == 0))
192 return "";
193
194 if (!reallyNormalize)
195 return in;
196
197 // in = in.toLowerCase(Locale.GERMANY).trim();
198 in = in.toLowerCase().trim();
199
200 for (int i = 0; i < patterns.length; i++)
201 in = patterns[i].matcher(in).replaceAll(replace[i]);
202
203 return in;
204 }
205
206 /**
207 * Activates or deactivates normalizing. Used in miless software to make
208 * indexing of scorm and searching possible
209 *
210 * @param value
211 * true normalize strings false do not normalize strings
212 *
213 */
214 public static final void setDoNormalize(boolean value) {
215 normalize = value;
216 }
217 }