001    /*
002     * 
003     * $Revision: 15386 $ $Date: 2009-06-22 09:54:52 +0200 (Mon, 22 Jun 2009) $
004     *
005     * This file is part of ***  M y C o R e  ***
006     * See http://www.mycore.de/ for details.
007     *
008     * This program is free software; you can use it, redistribute it
009     * and / or modify it under the terms of the GNU General Public License
010     * (GPL) as published by the Free Software Foundation; either version 2
011     * of the License or (at your option) any later version.
012     *
013     * This program is distributed in the hope that it will be useful, but
014     * WITHOUT ANY WARRANTY; without even the implied warranty of
015     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016     * GNU General Public License for more details.
017     *
018     * You should have received a copy of the GNU General Public License
019     * along with this program, in a file called gpl.txt or license.txt.
020     * If not, write to the Free Software Foundation Inc.,
021     * 59 Temple Place - Suite 330, Boston, MA  02111-1307 USA
022     */
023    
024    package org.mycore.backend.lucene;
025    
026    import java.io.StringReader;
027    import java.util.Hashtable;
028    import java.util.List;
029    import java.util.StringTokenizer;
030    
031    import org.apache.log4j.Logger;
032    import org.apache.lucene.analysis.Analyzer;
033    import org.apache.lucene.analysis.Token;
034    import org.apache.lucene.analysis.TokenStream;
035    import org.apache.lucene.index.Term;
036    import org.apache.lucene.queryParser.QueryParser;
037    import org.apache.lucene.search.BooleanClause;
038    import org.apache.lucene.search.BooleanQuery;
039    import org.apache.lucene.search.ConstantScoreRangeQuery;
040    import org.apache.lucene.search.FuzzyQuery;
041    import org.apache.lucene.search.PhraseQuery;
042    import org.apache.lucene.search.PrefixQuery;
043    import org.apache.lucene.search.Query;
044    import org.apache.lucene.search.TermQuery;
045    import org.apache.lucene.search.WildcardQuery;
046    import org.mycore.common.MCRUtils;
047    import org.mycore.services.fieldquery.MCRFieldDef;
048    
049    /**
050     * This class builds a Lucene Query from XML query (specified by Frank
051     * Lützenkirchen)
052     * 
053     * @author Harald Richter
054     * 
055     * @version $Revision: 15386 $ $Date: 2009-06-22 09:54:52 +0200 (Mon, 22 Jun 2009) $
056     * 
057     */
058    public class MCRBuildLuceneQuery {
059        private static final Logger LOGGER = Logger.getLogger(MCRBuildLuceneQuery.class);
060    
061        static
062        {
063          BooleanQuery.setMaxClauseCount( 10000 );
064        }
065        
066        static Hashtable search = null;
067    
068        /**
069         * Build Lucene Query from XML
070         * 
071         * @return Lucene Query
072         * 
073         */
074        public static Query buildLuceneQuery(BooleanQuery r, boolean reqf, List f, Analyzer analyzer) throws Exception {
075            for (int i = 0; i < f.size(); i++) {
076                org.jdom.Element xEle = (org.jdom.Element) (f.get(i));
077                String name = xEle.getName();
078                if ("boolean".equals(name))
079                  name = xEle.getAttributeValue("operator").toLowerCase();
080                Query x = null;
081    
082                boolean reqfn = reqf;
083                boolean prof = false;
084    
085                if (name.equals("and")) {
086                    x = buildLuceneQuery(null, true, xEle.getChildren(), analyzer);
087                } else if (name.equalsIgnoreCase("or")) {
088                    x = buildLuceneQuery(null, false, xEle.getChildren(), analyzer);
089                } else if (name.equalsIgnoreCase("not")) {
090                    x = buildLuceneQuery(null, false, xEle.getChildren(), analyzer);
091                    reqfn = false; // javadoc lucene: It is an error to specify a
092                                    // clause as both required and prohibited
093                    prof = true;
094                } else if (name.equalsIgnoreCase("condition")) {
095                    String field = xEle.getAttributeValue("field", "");
096                    String operator = xEle.getAttributeValue("operator", "");
097                    String value = xEle.getAttributeValue("value", "");
098    
099                    LOGGER.debug("field: " + field + " operator: " + operator + " value: " + value);
100    
101                    String fieldtype = MCRFieldDef.getDef( field ).getDataType();
102    
103                    if ("name".equals(fieldtype)) {
104                        fieldtype = "text";
105                    }
106                    if("index".equals(fieldtype)){
107                            fieldtype = "identifier";
108                            value = convertToGermanIndexString(value);
109                    }
110    
111                    x = handleCondition(field, operator, value, fieldtype, reqf, analyzer);
112                }
113    
114                if (null != x) {
115                    if (null == r) {
116                        r = new BooleanQuery();
117                    }
118    
119                    //BooleanClause bq = new BooleanClause(x, reqfn, prof);
120                    BooleanClause.Occur occur = BooleanClause.Occur.MUST;
121                    
122                    if (reqfn && !prof )
123                      ;
124                    else if ( !reqfn && !prof)
125                      occur = BooleanClause.Occur.SHOULD; 
126                    else if ( !reqfn && prof)
127                      occur = BooleanClause.Occur.MUST_NOT; 
128                    BooleanClause bq = new BooleanClause(x, occur);
129                    r.add(bq);
130                }
131            } // for
132    
133            return r;
134        }
135    
136        private static Query handleCondition(String field, String operator, String value, String fieldtype, boolean reqf, Analyzer analyzer) throws Exception {
137            if ("text".equals(fieldtype) && "contains".equals(operator)) {
138                BooleanQuery bq = null;
139    
140                Term te;
141                TermQuery tq = null;
142    
143                TokenStream ts = analyzer.tokenStream(field, new StringReader(value));
144                Token to;
145    
146                while ((to = ts.next()) != null) {
147                    te = new Term(field, to.termText());
148    
149                    if ((null != tq) && (null == bq)) // not first token
150                    {
151                        bq = new BooleanQuery();
152                        //bq.add(tq, reqf, false);
153                        if (reqf)
154                          bq.add(tq, BooleanClause.Occur.MUST);
155                        else
156                          bq.add(tq, BooleanClause.Occur.SHOULD);
157                    }
158    
159                    tq = new TermQuery(te);
160    
161                    if (null != bq) {
162                        //bq.add(tq, reqf, false);
163                        if (reqf)
164                          bq.add(tq, BooleanClause.Occur.MUST);
165                        else
166                          bq.add(tq, BooleanClause.Occur.SHOULD);
167                    }
168                }
169    
170                if (null != bq) {
171                    return bq;
172                }
173                return tq;
174            } else if (("text".equals(fieldtype) || "identifier".equals(fieldtype)) && "like".equals(operator)) {
175                Term te;
176                
177                String help = value.endsWith("*") ? value.substring(0, value.length()-1) : value;
178    
179                if ((-1 != help.indexOf("*")) || (-1 != help.indexOf("?"))) {
180                    LOGGER.debug("WILDCARD");
181    
182                    te = new Term(field, value);
183                    return new WildcardQuery(te);
184                }
185    
186                te = new Term(field, help);
187                return new PrefixQuery(te);
188            } else if ("text".equals(fieldtype) && ("phrase".equals(operator) || "=".equals(operator))) {
189                Term te;
190                PhraseQuery pq = new PhraseQuery();
191                TokenStream ts = analyzer.tokenStream(field, new StringReader(value));
192                Token to;
193    
194                while ((to = ts.next()) != null) {
195                    te = new Term(field, to.termText());
196                    pq.add(te);
197                }
198    
199                return pq;
200            } else if ("text".equals(fieldtype) && "fuzzy".equals(operator)) // 1.9.05
201                                                                                // future
202                                                                                // use
203            {
204                Term te;
205                value = fixQuery(value);
206                te = new Term(field, value);
207    
208                return new FuzzyQuery(te);
209            } else if ("text".equals(fieldtype) && "range".equals(operator)) // 1.9.05
210                                                                                // future
211                                                                                // use
212            {
213                String lower = null;
214                String upper = null;
215                TokenStream ts = analyzer.tokenStream(field, new StringReader(value));
216                Token to;
217    
218                to = ts.next();
219                lower = to.termText();
220                to = ts.next();
221    
222                if (null != to) {
223                    upper = to.termText();
224                }
225    
226                return new ConstantScoreRangeQuery(field, lower, upper, true, true);
227            } else if ("date".equals(fieldtype) || "time".equals(fieldtype) || "timestamp".equals(fieldtype)) {
228                return DateQuery2(field, operator, value);
229            } else if ("identifier".equals(fieldtype) && "=".equals(operator)) {
230                Term te = new Term(field, value);
231    
232                return new TermQuery(te);
233            } else if ("boolean".equals(fieldtype) ) {
234                Term te = new Term(field, "true".equals(value) ? "1" : "0");
235    
236                return new TermQuery(te);
237            } else if ("decimal".equals(fieldtype)) {  
238              return NumberQuery(field, "decimal", operator, value);
239            } else if ("integer".equals(fieldtype)) {  
240              return NumberQuery(field, "integer", operator, value);
241            } else if ("text".equals(fieldtype) && "lucene".equals(operator)) // value
242                                                                                // contains
243                                                                                // query
244                                                                                // for
245                                                                                // lucene,
246            // use query parser
247            {
248                QueryParser qp = new QueryParser(field, analyzer);
249                Query query = qp.parse( fixQuery(value) );
250                
251                LOGGER.debug("Lucene query: " + query.toString());
252    
253                return query;
254            } else {
255                LOGGER.info("Not supported, fieldtype: " + fieldtype + " operator: " + operator);
256            }
257    
258            return null;
259        }
260    
261        // code from Otis Gospodnetic http://www.jguru.com/faq/view.jsp?EID=538312
262        // Question Are Wildcard, Prefix, and Fuzzy queries case sensitive?
263        // Yes, unlike other types of Lucene queries, Wildcard, Prefix, and Fuzzy
264        // queries are case sensitive. That is because those types of queries are
265        // not passed through the Analyzer, which is the component that performs
266        // operations such as stemming and lowercasing.
267        public static String fixQuery(String aQuery) {
268            aQuery = MCRUtils.replaceString(aQuery, "'", "\""); // handle phrase
269    
270            StringTokenizer _tokenizer = new StringTokenizer(aQuery, " \t\n\r", true);
271            StringBuffer _fixedQuery = new StringBuffer(aQuery.length());
272            boolean _inString = false;
273    
274            while (_tokenizer.hasMoreTokens()) {
275                String _token = _tokenizer.nextToken();
276    
277                if ((!"NOT".equals(_token) && !"AND".equals(_token) && !"OR".equals(_token) && !"TO".equals(_token)) || _inString) {
278                    _fixedQuery.append(_token.toLowerCase());
279                } else {
280                    _fixedQuery.append(_token);
281                }
282    
283                int _nbQuotes = count(_token, "\""); // Count the "
284                int _nbEscapedQuotes = count(_token, "\\\""); // Count the \"
285    
286                if (((_nbQuotes - _nbEscapedQuotes) % 2) != 0) {
287                    // there is an odd number of string delimiters
288                    _inString = !_inString;
289                }
290            }
291    
292            String qu = _fixedQuery.toString();
293            qu = MCRUtils.replaceString(qu, "ä", "a");
294            qu = MCRUtils.replaceString(qu, "ö", "o");
295            qu = MCRUtils.replaceString(qu, "ü", "u");
296            qu = MCRUtils.replaceString(qu, "ß", "ss");
297    
298            return qu;
299        }
300    
301        private static int count(String aSourceString, String aCountString) {
302            int fromIndex = 0;
303            int foundIndex = 0;
304            int count = 0;
305    
306            while ((foundIndex = aSourceString.indexOf(aCountString, fromIndex)) > -1) {
307                count++;
308                fromIndex = ++foundIndex;
309            }
310    
311            return count;
312        }
313    
314        /***************************************************************************
315         * NumberQuery ()
316         **************************************************************************/
317        private static Query NumberQuery(String fieldname, String type, String Op, String value) throws Exception {
318            if (value.length() == 0) {
319                return null;
320            }
321            
322            String lower = "0000000000000000000";
323            String upper = "9999999999999999999";
324            
325            if (Op.equals(">") || Op.equals(">=") ) {
326              lower = MCRLuceneSearcher.handleNumber(value, type, Op.equals(">") ? 1 : 0);
327              upper = upper.substring(0, lower.length() );
328          } else if (Op.equals("<") || Op.equals("<=") ) {
329              upper = MCRLuceneSearcher.handleNumber(value, type, Op.equals("<") ? -1 : 0);
330              lower = lower.substring(0, upper.length() );
331          } else if (Op.equals("=")) {
332              return new TermQuery(new Term(fieldname, MCRLuceneSearcher.handleNumber(value, type, 0)));
333          } else {
334              LOGGER.info("Invalid operator for Number: " + Op);
335    
336              return null;
337          }
338    
339          return new ConstantScoreRangeQuery(fieldname, lower, upper, true, true);
340        }
341        
342        /***************************************************************************
343         * DateQuery2 ()
344         **************************************************************************/
345        private static Query DateQuery2(String fieldname,  String Op, String value) {
346            if (value.length() == 0) {
347                return null;
348            }
349            
350            String lower = null;
351            String upper = null;
352            
353            if (Op.equals(">") || Op.equals(">=") ) {
354              lower = value;
355          } else if (Op.equals("<") || Op.equals("<=") ) {
356              upper = value;
357          } else if (Op.equals("=")) {
358              return new TermQuery( new Term(fieldname, value) );
359          } else {
360              LOGGER.info("Invalid operator for Number: " + Op);
361    
362              return null;
363          }
364    
365          boolean incl = Op.equals(">=") || Op.equals("<=") ? true : false;   
366    //      return new RangeQuery( lower, upper, incl);
367          return new ConstantScoreRangeQuery(fieldname, lower, upper, incl, incl);
368        }
369        
370        /**
371         * Converts the given string to lower case and replaces German Umlaute and SZ 
372         * @param s the input string
373         * @return the converted string
374         */
375        public static String convertToGermanIndexString(String s){
376            return s.toLowerCase().replaceAll("ä", "ae").replaceAll("ö", "oe")
377                    .replaceAll("ü", "ue").replaceAll("ß", "ss");
378        }
379    }