1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.mycore.common;
20
import java.lang.Character.UnicodeScript;
import java.util.EnumMap;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.StringTokenizer;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
32
33
34
35
36
37
38
39
40
41 public class MCRLanguageDetector {
42 private static Logger LOGGER = LogManager.getLogger(MCRLanguageDetector.class);
43
44 private static Properties words = new Properties();
45
46 private static Properties endings = new Properties();
47
48 private static Map<UnicodeScript, String> code2languageCodes = new HashMap<>();
49
50 static {
51 code2languageCodes.put(UnicodeScript.ARABIC, "ar");
52 code2languageCodes.put(UnicodeScript.GREEK, "el");
53 code2languageCodes.put(UnicodeScript.HAN, "zh");
54 code2languageCodes.put(UnicodeScript.HEBREW, "he");
55 code2languageCodes.put(UnicodeScript.HIRAGANA, "ja");
56 code2languageCodes.put(UnicodeScript.KATAKANA, "ja");
57
58 words.put("de", "als am auch auf aus bei bis das dem den der deren derer des dessen"
59 + " die dies diese dieser dieses ein eine einer eines einem für"
60 + " hat im ist mit sich sie über und vom von vor wie zu zum zur");
61 words.put("en",
62 "a and are as at do for from has have how its like new of on or the their through to with you your");
63 words.put("fr", "la le les un une des, à aux de pour par sur comme aussi jusqu'à"
64 + " jusqu'aux quel quels quelles laquelle lequel lesquelles"
65 + " lesquelles auxquels auxquelles avec sans ont sont duquel desquels desquelles quand");
66
67 endings.put("en", "ar ble cal ce ced ed ent ic ies ing ive ness our ous ons ral th ure y");
68 endings.put("de", "ag chen gen ger iche icht ig ige isch ische ischen kar ker"
69 + " keit ler mus nen ner rie rer ter ten trie tz ung yse");
70 endings.put("fr", "é, és, ée, ées, euse, euses, ème, euil, asme, isme, aux");
71 }
72
73 private static int buildScore(String text, String lang, String wordList, String endings) {
74 text = text.toLowerCase(Locale.ROOT).trim();
75 text = text.replace(',', ' ').replace('-', ' ').replace('/', ' ');
76 text = " " + text + " ";
77
78 int score = 0;
79
80 StringTokenizer st = new StringTokenizer(wordList, " ");
81 while (st.hasMoreTokens()) {
82 String word = st.nextToken();
83 int pos = 0;
84 while ((pos = text.indexOf(" " + word + " ", pos)) >= 0) {
85 score += 2;
86 pos = Math.min(pos + word.length() + 1, text.length());
87 }
88 }
89
90 st = new StringTokenizer(endings, " ");
91 while (st.hasMoreTokens()) {
92 String ending = st.nextToken();
93
94 if (text.contains(ending + " ")) {
95 score += 1;
96 }
97 int pos = 0;
98 while ((pos = text.indexOf(ending + " ", pos)) >= 0) {
99 score += 1;
100 pos = Math.min(pos + ending.length() + 1, text.length());
101 }
102 }
103
104 LOGGER.debug("Score {} = {}", lang, score);
105 return score;
106 }
107
108 public static String detectLanguageByCharacter(String text) {
109 if (text == null || text.isEmpty()) {
110 LOGGER.warn("The text for language detection is null or empty");
111 return null;
112 }
113 LOGGER.debug("Detecting language of [{}]", text);
114
115 Map<UnicodeScript, AtomicInteger> scores = new HashMap<>();
116 buildScores(text, scores);
117 UnicodeScript code = getCodeWithMaxScore(scores);
118
119 return code2languageCodes.getOrDefault(code, null);
120 }
121
122 private static void buildScores(String text, Map<UnicodeScript, AtomicInteger> scores) {
123 try {
124 char[] chararray = text.toCharArray();
125 for (int i = 0; i < text.length(); i++) {
126 UnicodeScript code = UnicodeScript.of(Character.codePointAt(chararray, i));
127 increaseScoreFor(scores, code);
128 }
129 } catch (Exception ignored) {
130 }
131 }
132
133 private static void increaseScoreFor(Map<UnicodeScript, AtomicInteger> scores, UnicodeScript code) {
134 scores.computeIfAbsent(code, k -> new AtomicInteger()).incrementAndGet();
135 }
136
137 private static UnicodeScript getCodeWithMaxScore(Map<UnicodeScript, AtomicInteger> scores) {
138 UnicodeScript maxCode = null;
139 int maxScore = 0;
140 for (UnicodeScript code : scores.keySet()) {
141 int score = scores.get(code).get();
142 if (score > maxScore) {
143 maxScore = score;
144 maxCode = code;
145 }
146 }
147 return maxCode;
148 }
149
150
151
152
153
154
155
156 public static String detectLanguage(String text) {
157 LOGGER.debug("Detecting language of [{}]", text);
158
159 String bestLanguage = detectLanguageByCharacter(text);
160
161 if (bestLanguage == null) {
162 int bestScore = 0;
163 Enumeration<Object> languages = words.keys();
164 while (languages.hasMoreElements()) {
165 String language = (String) languages.nextElement();
166 String wordList = words.getProperty(language);
167 String endingList = endings.getProperty(language);
168
169 int score = buildScore(text, language, wordList, endingList);
170 if (score > bestScore) {
171 bestLanguage = language;
172 bestScore = score;
173 }
174 }
175 }
176
177 LOGGER.debug("Detected language = {}", bestLanguage);
178 return bestLanguage;
179 }
180 }