001    /*
002     * 
003     * $Revision: 13085 $ $Date: 2008-02-06 18:27:24 +0100 (Mi, 06 Feb 2008) $
004     *
005     * This file is part of ***  M y C o R e  ***
006     * See http://www.mycore.de/ for details.
007     *
008     * This program is free software; you can use it, redistribute it
009     * and / or modify it under the terms of the GNU General Public License
010     * (GPL) as published by the Free Software Foundation; either version 2
011     * of the License or (at your option) any later version.
012     *
013     * This program is distributed in the hope that it will be useful, but
014     * WITHOUT ANY WARRANTY; without even the implied warranty of
015     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016     * GNU General Public License for more details.
017     *
018     * You should have received a copy of the GNU General Public License
019     * along with this program, in a file called gpl.txt or license.txt.
020     * If not, write to the Free Software Foundation Inc.,
021     * 59 Temple Place - Suite 330, Boston, MA  02111-1307 USA
022     */
023    
024    package org.mycore.common;
025    
026    import java.util.*;
027    
028    import org.apache.log4j.Logger;
029    
030    /**
031     * Detects the language of a given text string by 
032     * looking for typical words and word endings for each language.
033     * German, English and French are currently supported.
034     * 
035     * @author Frank Lützenkirchen
036     * @version $Revision: 13085 $ $Date: 2008-02-06 18:27:24 +0100 (Mi, 06 Feb 2008) $
037     */
038    public class MCRLanguageDetector {
039        private static Logger LOGGER = Logger.getLogger(MCRLanguageDetector.class);
040    
041        private static Properties words = new Properties();
042    
043        private static Properties endings = new Properties();
044    
045        static {
046            words.put("de", "als am auch auf aus bei bis das dem den der deren derer des dessen die dies diese dieser dieses ein eine einer eines einem für hat im ist mit sich sie über und vom von vor wie zu zum zur");
047            words.put("en", "a and are as at do for from has have how its like new of on or the their through to with you your");
048            words.put("fr", "la le les un une des, à aux de pour par sur comme aussi jusqu'à jusqu'aux quel quels quelles laquelle lequel lesquelles lesquelles auxquels auxquelles avec sans ont sont duquel desquels desquelles quand");
049    
050            endings.put("en", "ar ble cal ce ced ed ent ic ies ing ive ness our ous ons ral th ure y");
051            endings.put("de", "ag chen gen ger iche icht ig ige isch ische ischen kar ker keit ler mus nen ner rie rer ter ten trie tz ung yse");
052            endings.put("fr", "é, és, ée, ées, euse, euses, ème, euil, asme, isme, aux");
053        }
054    
055        private static int buildScore(String text, String lang, String wordList, String endings) {
056            text = text.toLowerCase().trim();
057            text = text.replace(',', ' ').replace('-', ' ').replace('/', ' ');
058            text = " " + text + " ";
059    
060            int score = 0;
061    
062            StringTokenizer st = new StringTokenizer(wordList, " ");
063            while (st.hasMoreTokens()) {
064                String word = st.nextToken();
065                int pos = 0;
066                while ((pos = text.indexOf(" " + word + " ", pos)) >= 0) {
067                    score += 2;
068                    pos = Math.min(pos + word.length() + 1, text.length());
069                }
070            }
071    
072            st = new StringTokenizer(endings, " ");
073            while (st.hasMoreTokens()) {
074                String ending = st.nextToken();
075    
076                if (text.indexOf(ending + " ") >= 0)
077                    score += 1;
078                int pos = 0;
079                while ((pos = text.indexOf(ending + " ", pos)) >= 0) {
080                    score += 1;
081                    pos = Math.min(pos + ending.length() + 1, text.length());
082                }
083            }
084    
085            LOGGER.debug("Score " + lang + " = " + score);
086            return score;
087        }
088    
089        /**
090         * Detects the language of a given text string.
091         * 
092         * @param text the text string
093         * @return the language code: de, en, fr or null
094         */
095        public static String detectLanguage(String text) {
096            LOGGER.debug("Detecting language of [" + text + "]");
097    
098            String bestLanguage = null;
099            int bestScore = 0;
100    
101            Enumeration languages = words.keys();
102            while (languages.hasMoreElements()) {
103                String language = (String) (languages.nextElement());
104                String wordList = words.getProperty(language);
105                String endingList = endings.getProperty(language);
106    
107                int score = buildScore(text, language, wordList, endingList);
108                if (score > bestScore) {
109                    bestLanguage = language;
110                    bestScore = score;
111                }
112            }
113    
114            LOGGER.debug("Detected language = " + bestLanguage);
115            return bestLanguage;
116        }
117    }