View Javadoc
1   /*
2    * This file is part of ***  M y C o R e  ***
3    * See http://www.mycore.de/ for details.
4    *
5    * MyCoRe is free software: you can redistribute it and/or modify
6    * it under the terms of the GNU General Public License as published by
7    * the Free Software Foundation, either version 3 of the License, or
8    * (at your option) any later version.
9    *
10   * MyCoRe is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU General Public License for more details.
14   *
15   * You should have received a copy of the GNU General Public License
16   * along with MyCoRe.  If not, see <http://www.gnu.org/licenses/>.
17   */
18  
19  package org.mycore.common;
20  
21  import java.lang.Character.UnicodeScript;
22  import java.util.Enumeration;
23  import java.util.HashMap;
24  import java.util.Locale;
25  import java.util.Map;
26  import java.util.Properties;
27  import java.util.StringTokenizer;
28  import java.util.concurrent.atomic.AtomicInteger;
29  
30  import org.apache.logging.log4j.LogManager;
31  import org.apache.logging.log4j.Logger;
32  
33  /**
34   * Detects the language of a given text string by 
35   * looking for typical words and word endings and used characters for each language.
36   * German, english, french, arabic, chinese, japanese, greek and hebrew are currently supported.
37   * 
38   * @author Frank Lützenkirchen
39   * @version $Revision$ $Date$
40   */
41  public class MCRLanguageDetector {
42      private static Logger LOGGER = LogManager.getLogger(MCRLanguageDetector.class);
43  
44      private static Properties words = new Properties();
45  
46      private static Properties endings = new Properties();
47  
48      private static Map<UnicodeScript, String> code2languageCodes = new HashMap<>();
49  
50      static {
51          code2languageCodes.put(UnicodeScript.ARABIC, "ar");
52          code2languageCodes.put(UnicodeScript.GREEK, "el");
53          code2languageCodes.put(UnicodeScript.HAN, "zh");
54          code2languageCodes.put(UnicodeScript.HEBREW, "he");
55          code2languageCodes.put(UnicodeScript.HIRAGANA, "ja");
56          code2languageCodes.put(UnicodeScript.KATAKANA, "ja");
57  
58          words.put("de", "als am auch auf aus bei bis das dem den der deren derer des dessen"
59              + " die dies diese dieser dieses ein eine einer eines einem für"
60              + " hat im ist mit sich sie über und vom von vor wie zu zum zur");
61          words.put("en",
62              "a and are as at do for from has have how its like new of on or the their through to with you your");
63          words.put("fr", "la le les un une des, à aux de pour par sur comme aussi jusqu'à"
64              + " jusqu'aux quel quels quelles laquelle lequel lesquelles"
65              + " lesquelles auxquels auxquelles avec sans ont sont duquel desquels desquelles quand");
66  
67          endings.put("en", "ar ble cal ce ced ed ent ic ies ing ive ness our ous ons ral th ure y");
68          endings.put("de", "ag chen gen ger iche icht ig ige isch ische ischen kar ker"
69              + " keit ler mus nen ner rie rer ter ten trie tz ung yse");
70          endings.put("fr", "é, és, ée, ées, euse, euses, ème, euil, asme, isme, aux");
71      }
72  
73      private static int buildScore(String text, String lang, String wordList, String endings) {
74          text = text.toLowerCase(Locale.ROOT).trim();
75          text = text.replace(',', ' ').replace('-', ' ').replace('/', ' ');
76          text = " " + text + " ";
77  
78          int score = 0;
79  
80          StringTokenizer st = new StringTokenizer(wordList, " ");
81          while (st.hasMoreTokens()) {
82              String word = st.nextToken();
83              int pos = 0;
84              while ((pos = text.indexOf(" " + word + " ", pos)) >= 0) {
85                  score += 2;
86                  pos = Math.min(pos + word.length() + 1, text.length());
87              }
88          }
89  
90          st = new StringTokenizer(endings, " ");
91          while (st.hasMoreTokens()) {
92              String ending = st.nextToken();
93  
94              if (text.contains(ending + " ")) {
95                  score += 1;
96              }
97              int pos = 0;
98              while ((pos = text.indexOf(ending + " ", pos)) >= 0) {
99                  score += 1;
100                 pos = Math.min(pos + ending.length() + 1, text.length());
101             }
102         }
103 
104         LOGGER.debug("Score {} = {}", lang, score);
105         return score;
106     }
107 
108     public static String detectLanguageByCharacter(String text) {
109         if (text == null || text.isEmpty()) {
110             LOGGER.warn("The text for language detection is null or empty");
111             return null;
112         }
113         LOGGER.debug("Detecting language of [{}]", text);
114 
115         Map<UnicodeScript, AtomicInteger> scores = new HashMap<>();
116         buildScores(text, scores);
117         UnicodeScript code = getCodeWithMaxScore(scores);
118 
119         return code2languageCodes.getOrDefault(code, null);
120     }
121 
122     private static void buildScores(String text, Map<UnicodeScript, AtomicInteger> scores) {
123         try {
124             char[] chararray = text.toCharArray();
125             for (int i = 0; i < text.length(); i++) {
126                 UnicodeScript code = UnicodeScript.of(Character.codePointAt(chararray, i));
127                 increaseScoreFor(scores, code);
128             }
129         } catch (Exception ignored) {
130         }
131     }
132 
133     private static void increaseScoreFor(Map<UnicodeScript, AtomicInteger> scores, UnicodeScript code) {
134         scores.computeIfAbsent(code, k -> new AtomicInteger()).incrementAndGet();
135     }
136 
137     private static UnicodeScript getCodeWithMaxScore(Map<UnicodeScript, AtomicInteger> scores) {
138         UnicodeScript maxCode = null;
139         int maxScore = 0;
140         for (UnicodeScript code : scores.keySet()) {
141             int score = scores.get(code).get();
142             if (score > maxScore) {
143                 maxScore = score;
144                 maxCode = code;
145             }
146         }
147         return maxCode;
148     }
149 
150     /**
151      * Detects the language of a given text string.
152      * 
153      * @param text the text string
154      * @return the language code: de, en, fr, ar ,el, zh, he, jp or null
155      */
156     public static String detectLanguage(String text) {
157         LOGGER.debug("Detecting language of [{}]", text);
158 
159         String bestLanguage = detectLanguageByCharacter(text);
160 
161         if (bestLanguage == null) {
162             int bestScore = 0;
163             Enumeration<Object> languages = words.keys();
164             while (languages.hasMoreElements()) {
165                 String language = (String) languages.nextElement();
166                 String wordList = words.getProperty(language);
167                 String endingList = endings.getProperty(language);
168 
169                 int score = buildScore(text, language, wordList, endingList);
170                 if (score > bestScore) {
171                     bestLanguage = language;
172                     bestScore = score;
173                 }
174             }
175         }
176 
177         LOGGER.debug("Detected language = {}", bestLanguage);
178         return bestLanguage;
179     }
180 }