001 /*
002 *
003 * $Revision: 15386 $ $Date: 2009-06-22 09:54:52 +0200 (Mon, 22 Jun 2009) $
004 *
005 * This file is part of *** M y C o R e ***
006 * See http://www.mycore.de/ for details.
007 *
008 * This program is free software; you can use it, redistribute it
009 * and / or modify it under the terms of the GNU General Public License
010 * (GPL) as published by the Free Software Foundation; either version 2
011 * of the License or (at your option) any later version.
012 *
013 * This program is distributed in the hope that it will be useful, but
014 * WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
016 * GNU General Public License for more details.
017 *
018 * You should have received a copy of the GNU General Public License
019 * along with this program, in a file called gpl.txt or license.txt.
020 * If not, write to the Free Software Foundation Inc.,
021 * 59 Temple Place - Suite 330, Boston, MA 02111-1307 USA
022 */
023
024 package org.mycore.backend.lucene;
025
026 import java.io.StringReader;
027 import java.util.Hashtable;
028 import java.util.List;
029 import java.util.StringTokenizer;
030
031 import org.apache.log4j.Logger;
032 import org.apache.lucene.analysis.Analyzer;
033 import org.apache.lucene.analysis.Token;
034 import org.apache.lucene.analysis.TokenStream;
035 import org.apache.lucene.index.Term;
036 import org.apache.lucene.queryParser.QueryParser;
037 import org.apache.lucene.search.BooleanClause;
038 import org.apache.lucene.search.BooleanQuery;
039 import org.apache.lucene.search.ConstantScoreRangeQuery;
040 import org.apache.lucene.search.FuzzyQuery;
041 import org.apache.lucene.search.PhraseQuery;
042 import org.apache.lucene.search.PrefixQuery;
043 import org.apache.lucene.search.Query;
044 import org.apache.lucene.search.TermQuery;
045 import org.apache.lucene.search.WildcardQuery;
046 import org.mycore.common.MCRUtils;
047 import org.mycore.services.fieldquery.MCRFieldDef;
048
049 /**
050 * This class builds a Lucene Query from XML query (specified by Frank
051 * Lützenkirchen)
052 *
053 * @author Harald Richter
054 *
055 * @version $Revision: 15386 $ $Date: 2009-06-22 09:54:52 +0200 (Mon, 22 Jun 2009) $
056 *
057 */
058 public class MCRBuildLuceneQuery {
059 private static final Logger LOGGER = Logger.getLogger(MCRBuildLuceneQuery.class);
060
061 static
062 {
063 BooleanQuery.setMaxClauseCount( 10000 );
064 }
065
066 static Hashtable search = null;
067
068 /**
069 * Build Lucene Query from XML
070 *
071 * @return Lucene Query
072 *
073 */
074 public static Query buildLuceneQuery(BooleanQuery r, boolean reqf, List f, Analyzer analyzer) throws Exception {
075 for (int i = 0; i < f.size(); i++) {
076 org.jdom.Element xEle = (org.jdom.Element) (f.get(i));
077 String name = xEle.getName();
078 if ("boolean".equals(name))
079 name = xEle.getAttributeValue("operator").toLowerCase();
080 Query x = null;
081
082 boolean reqfn = reqf;
083 boolean prof = false;
084
085 if (name.equals("and")) {
086 x = buildLuceneQuery(null, true, xEle.getChildren(), analyzer);
087 } else if (name.equalsIgnoreCase("or")) {
088 x = buildLuceneQuery(null, false, xEle.getChildren(), analyzer);
089 } else if (name.equalsIgnoreCase("not")) {
090 x = buildLuceneQuery(null, false, xEle.getChildren(), analyzer);
091 reqfn = false; // javadoc lucene: It is an error to specify a
092 // clause as both required and prohibited
093 prof = true;
094 } else if (name.equalsIgnoreCase("condition")) {
095 String field = xEle.getAttributeValue("field", "");
096 String operator = xEle.getAttributeValue("operator", "");
097 String value = xEle.getAttributeValue("value", "");
098
099 LOGGER.debug("field: " + field + " operator: " + operator + " value: " + value);
100
101 String fieldtype = MCRFieldDef.getDef( field ).getDataType();
102
103 if ("name".equals(fieldtype)) {
104 fieldtype = "text";
105 }
106 if("index".equals(fieldtype)){
107 fieldtype = "identifier";
108 value = convertToGermanIndexString(value);
109 }
110
111 x = handleCondition(field, operator, value, fieldtype, reqf, analyzer);
112 }
113
114 if (null != x) {
115 if (null == r) {
116 r = new BooleanQuery();
117 }
118
119 //BooleanClause bq = new BooleanClause(x, reqfn, prof);
120 BooleanClause.Occur occur = BooleanClause.Occur.MUST;
121
122 if (reqfn && !prof )
123 ;
124 else if ( !reqfn && !prof)
125 occur = BooleanClause.Occur.SHOULD;
126 else if ( !reqfn && prof)
127 occur = BooleanClause.Occur.MUST_NOT;
128 BooleanClause bq = new BooleanClause(x, occur);
129 r.add(bq);
130 }
131 } // for
132
133 return r;
134 }
135
136 private static Query handleCondition(String field, String operator, String value, String fieldtype, boolean reqf, Analyzer analyzer) throws Exception {
137 if ("text".equals(fieldtype) && "contains".equals(operator)) {
138 BooleanQuery bq = null;
139
140 Term te;
141 TermQuery tq = null;
142
143 TokenStream ts = analyzer.tokenStream(field, new StringReader(value));
144 Token to;
145
146 while ((to = ts.next()) != null) {
147 te = new Term(field, to.termText());
148
149 if ((null != tq) && (null == bq)) // not first token
150 {
151 bq = new BooleanQuery();
152 //bq.add(tq, reqf, false);
153 if (reqf)
154 bq.add(tq, BooleanClause.Occur.MUST);
155 else
156 bq.add(tq, BooleanClause.Occur.SHOULD);
157 }
158
159 tq = new TermQuery(te);
160
161 if (null != bq) {
162 //bq.add(tq, reqf, false);
163 if (reqf)
164 bq.add(tq, BooleanClause.Occur.MUST);
165 else
166 bq.add(tq, BooleanClause.Occur.SHOULD);
167 }
168 }
169
170 if (null != bq) {
171 return bq;
172 }
173 return tq;
174 } else if (("text".equals(fieldtype) || "identifier".equals(fieldtype)) && "like".equals(operator)) {
175 Term te;
176
177 String help = value.endsWith("*") ? value.substring(0, value.length()-1) : value;
178
179 if ((-1 != help.indexOf("*")) || (-1 != help.indexOf("?"))) {
180 LOGGER.debug("WILDCARD");
181
182 te = new Term(field, value);
183 return new WildcardQuery(te);
184 }
185
186 te = new Term(field, help);
187 return new PrefixQuery(te);
188 } else if ("text".equals(fieldtype) && ("phrase".equals(operator) || "=".equals(operator))) {
189 Term te;
190 PhraseQuery pq = new PhraseQuery();
191 TokenStream ts = analyzer.tokenStream(field, new StringReader(value));
192 Token to;
193
194 while ((to = ts.next()) != null) {
195 te = new Term(field, to.termText());
196 pq.add(te);
197 }
198
199 return pq;
200 } else if ("text".equals(fieldtype) && "fuzzy".equals(operator)) // 1.9.05
201 // future
202 // use
203 {
204 Term te;
205 value = fixQuery(value);
206 te = new Term(field, value);
207
208 return new FuzzyQuery(te);
209 } else if ("text".equals(fieldtype) && "range".equals(operator)) // 1.9.05
210 // future
211 // use
212 {
213 String lower = null;
214 String upper = null;
215 TokenStream ts = analyzer.tokenStream(field, new StringReader(value));
216 Token to;
217
218 to = ts.next();
219 lower = to.termText();
220 to = ts.next();
221
222 if (null != to) {
223 upper = to.termText();
224 }
225
226 return new ConstantScoreRangeQuery(field, lower, upper, true, true);
227 } else if ("date".equals(fieldtype) || "time".equals(fieldtype) || "timestamp".equals(fieldtype)) {
228 return DateQuery2(field, operator, value);
229 } else if ("identifier".equals(fieldtype) && "=".equals(operator)) {
230 Term te = new Term(field, value);
231
232 return new TermQuery(te);
233 } else if ("boolean".equals(fieldtype) ) {
234 Term te = new Term(field, "true".equals(value) ? "1" : "0");
235
236 return new TermQuery(te);
237 } else if ("decimal".equals(fieldtype)) {
238 return NumberQuery(field, "decimal", operator, value);
239 } else if ("integer".equals(fieldtype)) {
240 return NumberQuery(field, "integer", operator, value);
241 } else if ("text".equals(fieldtype) && "lucene".equals(operator)) // value
242 // contains
243 // query
244 // for
245 // lucene,
246 // use query parser
247 {
248 QueryParser qp = new QueryParser(field, analyzer);
249 Query query = qp.parse( fixQuery(value) );
250
251 LOGGER.debug("Lucene query: " + query.toString());
252
253 return query;
254 } else {
255 LOGGER.info("Not supported, fieldtype: " + fieldtype + " operator: " + operator);
256 }
257
258 return null;
259 }
260
261 // code from Otis Gospodnetic http://www.jguru.com/faq/view.jsp?EID=538312
262 // Question Are Wildcard, Prefix, and Fuzzy queries case sensitive?
263 // Yes, unlike other types of Lucene queries, Wildcard, Prefix, and Fuzzy
264 // queries are case sensitive. That is because those types of queries are
265 // not passed through the Analyzer, which is the component that performs
266 // operations such as stemming and lowercasing.
267 public static String fixQuery(String aQuery) {
268 aQuery = MCRUtils.replaceString(aQuery, "'", "\""); // handle phrase
269
270 StringTokenizer _tokenizer = new StringTokenizer(aQuery, " \t\n\r", true);
271 StringBuffer _fixedQuery = new StringBuffer(aQuery.length());
272 boolean _inString = false;
273
274 while (_tokenizer.hasMoreTokens()) {
275 String _token = _tokenizer.nextToken();
276
277 if ((!"NOT".equals(_token) && !"AND".equals(_token) && !"OR".equals(_token) && !"TO".equals(_token)) || _inString) {
278 _fixedQuery.append(_token.toLowerCase());
279 } else {
280 _fixedQuery.append(_token);
281 }
282
283 int _nbQuotes = count(_token, "\""); // Count the "
284 int _nbEscapedQuotes = count(_token, "\\\""); // Count the \"
285
286 if (((_nbQuotes - _nbEscapedQuotes) % 2) != 0) {
287 // there is an odd number of string delimiters
288 _inString = !_inString;
289 }
290 }
291
292 String qu = _fixedQuery.toString();
293 qu = MCRUtils.replaceString(qu, "ä", "a");
294 qu = MCRUtils.replaceString(qu, "ö", "o");
295 qu = MCRUtils.replaceString(qu, "ü", "u");
296 qu = MCRUtils.replaceString(qu, "ß", "ss");
297
298 return qu;
299 }
300
301 private static int count(String aSourceString, String aCountString) {
302 int fromIndex = 0;
303 int foundIndex = 0;
304 int count = 0;
305
306 while ((foundIndex = aSourceString.indexOf(aCountString, fromIndex)) > -1) {
307 count++;
308 fromIndex = ++foundIndex;
309 }
310
311 return count;
312 }
313
314 /***************************************************************************
315 * NumberQuery ()
316 **************************************************************************/
317 private static Query NumberQuery(String fieldname, String type, String Op, String value) throws Exception {
318 if (value.length() == 0) {
319 return null;
320 }
321
322 String lower = "0000000000000000000";
323 String upper = "9999999999999999999";
324
325 if (Op.equals(">") || Op.equals(">=") ) {
326 lower = MCRLuceneSearcher.handleNumber(value, type, Op.equals(">") ? 1 : 0);
327 upper = upper.substring(0, lower.length() );
328 } else if (Op.equals("<") || Op.equals("<=") ) {
329 upper = MCRLuceneSearcher.handleNumber(value, type, Op.equals("<") ? -1 : 0);
330 lower = lower.substring(0, upper.length() );
331 } else if (Op.equals("=")) {
332 return new TermQuery(new Term(fieldname, MCRLuceneSearcher.handleNumber(value, type, 0)));
333 } else {
334 LOGGER.info("Invalid operator for Number: " + Op);
335
336 return null;
337 }
338
339 return new ConstantScoreRangeQuery(fieldname, lower, upper, true, true);
340 }
341
342 /***************************************************************************
343 * DateQuery2 ()
344 **************************************************************************/
345 private static Query DateQuery2(String fieldname, String Op, String value) {
346 if (value.length() == 0) {
347 return null;
348 }
349
350 String lower = null;
351 String upper = null;
352
353 if (Op.equals(">") || Op.equals(">=") ) {
354 lower = value;
355 } else if (Op.equals("<") || Op.equals("<=") ) {
356 upper = value;
357 } else if (Op.equals("=")) {
358 return new TermQuery( new Term(fieldname, value) );
359 } else {
360 LOGGER.info("Invalid operator for Number: " + Op);
361
362 return null;
363 }
364
365 boolean incl = Op.equals(">=") || Op.equals("<=") ? true : false;
366 // return new RangeQuery( lower, upper, incl);
367 return new ConstantScoreRangeQuery(fieldname, lower, upper, incl, incl);
368 }
369
370 /**
371 * Converts the given string to lower case and replaces German Umlaute and SZ
372 * @param s the input string
373 * @return the converted string
374 */
375 public static String convertToGermanIndexString(String s){
376 return s.toLowerCase().replaceAll("ä", "ae").replaceAll("ö", "oe")
377 .replaceAll("ü", "ue").replaceAll("ß", "ss");
378 }
379 }