001 /*
002 *
003 * $Revision: 13085 $ $Date: 2008-02-06 18:27:24 +0100 (Mi, 06 Feb 2008) $
004 *
005 * This file is part of *** M y C o R e ***
006 * See http://www.mycore.de/ for details.
007 *
008 * This program is free software; you can use it, redistribute it
009 * and / or modify it under the terms of the GNU General Public License
010 * (GPL) as published by the Free Software Foundation; either version 2
011 * of the License or (at your option) any later version.
012 *
013 * This program is distributed in the hope that it will be useful, but
014 * WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
016 * GNU General Public License for more details.
017 *
018 * You should have received a copy of the GNU General Public License
019 * along with this program, in a file called gpl.txt or license.txt.
020 * If not, write to the Free Software Foundation Inc.,
021 * 59 Temple Place - Suite 330, Boston, MA 02111-1307 USA
022 */
023
024 package org.mycore.common;
025
026 import java.util.*;
027
028 import org.apache.log4j.Logger;
029
030 /**
031 * Detects the language of a given text string by
032 * looking for typical words and word endings for each language.
033 * German, englisch and french are currently supported.
034 *
035 * @author Frank Lützenkirchen
036 * @version $Revision: 13085 $ $Date: 2008-02-06 18:27:24 +0100 (Mi, 06 Feb 2008) $
037 */
038 public class MCRLanguageDetector {
039 private static Logger LOGGER = Logger.getLogger(MCRLanguageDetector.class);
040
041 private static Properties words = new Properties();
042
043 private static Properties endings = new Properties();
044
045 static {
046 words.put("de", "als am auch auf aus bei bis das dem den der deren derer des dessen die dies diese dieser dieses ein eine einer eines einem für hat im ist mit sich sie über und vom von vor wie zu zum zur");
047 words.put("en", "a and are as at do for from has have how its like new of on or the their through to with you your");
048 words.put("fr", "la le les un une des, à aux de pour par sur comme aussi jusqu'à jusqu'aux quel quels quelles laquelle lequel lesquelles lesquelles auxquels auxquelles avec sans ont sont duquel desquels desquelles quand");
049
050 endings.put("en", "ar ble cal ce ced ed ent ic ies ing ive ness our ous ons ral th ure y");
051 endings.put("de", "ag chen gen ger iche icht ig ige isch ische ischen kar ker keit ler mus nen ner rie rer ter ten trie tz ung yse");
052 endings.put("fr", "é, és, ée, ées, euse, euses, ème, euil, asme, isme, aux");
053 }
054
055 private static int buildScore(String text, String lang, String wordList, String endings) {
056 text = text.toLowerCase().trim();
057 text = text.replace(',', ' ').replace('-', ' ').replace('/', ' ');
058 text = " " + text + " ";
059
060 int score = 0;
061
062 StringTokenizer st = new StringTokenizer(wordList, " ");
063 while (st.hasMoreTokens()) {
064 String word = st.nextToken();
065 int pos = 0;
066 while ((pos = text.indexOf(" " + word + " ", pos)) >= 0) {
067 score += 2;
068 pos = Math.min(pos + word.length() + 1, text.length());
069 }
070 }
071
072 st = new StringTokenizer(endings, " ");
073 while (st.hasMoreTokens()) {
074 String ending = st.nextToken();
075
076 if (text.indexOf(ending + " ") >= 0)
077 score += 1;
078 int pos = 0;
079 while ((pos = text.indexOf(ending + " ", pos)) >= 0) {
080 score += 1;
081 pos = Math.min(pos + ending.length() + 1, text.length());
082 }
083 }
084
085 LOGGER.debug("Score " + lang + " = " + score);
086 return score;
087 }
088
089 /**
090 * Detects the language of a given text string.
091 *
092 * @param text the text string
093 * @return the language code: de, en, fr or null
094 */
095 public static String detectLanguage(String text) {
096 LOGGER.debug("Detecting language of [" + text + "]");
097
098 String bestLanguage = null;
099 int bestScore = 0;
100
101 Enumeration languages = words.keys();
102 while (languages.hasMoreElements()) {
103 String language = (String) (languages.nextElement());
104 String wordList = words.getProperty(language);
105 String endingList = endings.getProperty(language);
106
107 int score = buildScore(text, language, wordList, endingList);
108 if (score > bestScore) {
109 bestLanguage = language;
110 bestScore = score;
111 }
112 }
113
114 LOGGER.debug("Detected language = " + bestLanguage);
115 return bestLanguage;
116 }
117 }