001    /*
002     * 
003     * $Revision: 14943 $ $Date: 2009-03-18 12:07:51 +0100 (Wed, 18 Mar 2009) $
004     *
005     * This file is part of ***  M y C o R e  ***
006     * See http://www.mycore.de/ for details.
007     *
008     * This program is free software; you can use it, redistribute it
009     * and / or modify it under the terms of the GNU General Public License
010     * (GPL) as published by the Free Software Foundation; either version 2
011     * of the License or (at your option) any later version.
012     *
013     * This program is distributed in the hope that it will be useful, but
014     * WITHOUT ANY WARRANTY; without even the implied warranty of
015     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016     * GNU General Public License for more details.
017     *
018     * You should have received a copy of the GNU General Public License
019     * along with this program, in a file called gpl.txt or license.txt.
020     * If not, write to the Free Software Foundation Inc.,
021     * 59 Temple Place - Suite 330, Boston, MA  02111-1307 USA
022     */
023    
024    package org.mycore.backend.jdom;
025    
026    import java.io.ByteArrayOutputStream;
027    import java.util.Enumeration;
028    import java.util.HashMap;
029    import java.util.Iterator;
030    import java.util.List;
031    import java.util.Properties;
032    import java.util.StringTokenizer;
033    import java.util.regex.Pattern;
034    
035    import javax.xml.transform.Source;
036    import javax.xml.transform.Transformer;
037    import javax.xml.transform.TransformerConfigurationException;
038    import javax.xml.transform.TransformerFactory;
039    import javax.xml.transform.stream.StreamResult;
040    
041    import org.apache.log4j.Logger;
042    import org.jdom.Document;
043    import org.jdom.Element;
044    import org.jdom.Namespace;
045    import org.jdom.output.Format;
046    import org.jdom.output.XMLOutputter;
047    import org.jdom.transform.JDOMSource;
048    import org.mycore.common.MCRConfiguration;
049    import org.mycore.common.MCRConstants;
050    import org.mycore.common.MCRException;
051    import org.mycore.datamodel.metadata.MCRObject;
052    import org.mycore.datamodel.metadata.MCRObjectID;
053    import org.mycore.datamodel.common.MCRXMLTableManager;
054    import org.mycore.frontend.editor.MCRInputValidator;
055    import org.mycore.parsers.bool.MCRAndCondition;
056    import org.mycore.parsers.bool.MCRCondition;
057    import org.mycore.parsers.bool.MCRNotCondition;
058    import org.mycore.parsers.bool.MCROrCondition;
059    import org.mycore.services.fieldquery.MCRData2Fields;
060    import org.mycore.services.fieldquery.MCRFieldDef;
061    import org.mycore.services.fieldquery.MCRFieldValue;
062    import org.mycore.services.fieldquery.MCRHit;
063    import org.mycore.services.fieldquery.MCRResults;
064    import org.mycore.services.fieldquery.MCRSearcher;
065    import org.mycore.services.fieldquery.MCRQueryCondition;
066    import org.mycore.services.fieldquery.MCRSortBy;
067    
068    /**
069     * Implements a searcher and indexer for MCRObject metadata using only data in
070     * memory without any persistent structures. When data is indexed, the values
071     * are stored as XML document in memory. When data is searched, the query is
072     * transformed to a XSL condition and run against the XML in memory. Before
073     * first use of instances of this class, all MCRObject metadata is loaded from
074     * persistent store and indexed in memory. This class may also be useful for
075     * learning how to implement MCRSearchers and indexers.
076     * 
077     * @author Frank Lützenkirchen
078     */
079    public class MCRJDOMSearcher extends MCRSearcher {
080        /** The logger */
081        private final static Logger LOGGER = Logger.getLogger(MCRJDOMSearcher.class);
082    
083        /**
084         * Map where key is entryID and value is XML document containing indexed
085         * data
086         */
087        private HashMap<String,Document> map = new HashMap<String,Document>();
088    
089        /** XSL transformer factory */
090        private TransformerFactory factory = TransformerFactory.newInstance();
091    
092        public void init(String ID) {
093            super.init(ID);
094    
095            MCRXMLTableManager mcr_xml = MCRXMLTableManager.instance();
096    
097            // Find all types of MCRObject data:
098            String cfgPrefix = "MCR.Metadata.Config.";
099            Properties props = MCRConfiguration.instance().getProperties(cfgPrefix);
100            for (Enumeration keys = props.keys(); keys.hasMoreElements();) {
101                String key = (String) (keys.nextElement());
102                String type = key.substring(cfgPrefix.length());
103                if ("derivate".equals(type))
104                    continue;
105    
106                LOGGER.debug("Now indexing metadata of all stored MCRObjects from type " + type);
107    
108                try {
109                    List IDs = mcr_xml.retrieveAllIDs(type);
110                    int numObjects = IDs.size();
111                    for (int i = 0; i < numObjects; i++) {
112                        String sid = (String) (IDs.get(i));
113                        MCRObject obj = new MCRObject();
114                        MCRObjectID oid = new MCRObjectID(sid);
115                        obj.setId(oid);
116                        obj.setFromXML(mcr_xml.retrieveAsXML(oid), false);
117                        List fields = MCRData2Fields.buildFields(obj, index);
118                        addToIndex(sid, sid, fields);
119                    }
120                } catch (Exception ex) {
121                    LOGGER.error(ex);
122                }
123            }
124        }
125    
126        public void addToIndex(String entryID, String returnID, List fields) {
127            if ((fields == null) || (fields.size() == 0)) {
128                return;
129            }
130    
131            LOGGER.info("MCRJDOMSearcher indexing data of " + entryID);
132            Element data = new Element("data");
133            data.setAttribute("returnID", returnID);
134    
135            for (int i = 0; i < fields.size(); i++) {
136                MCRFieldValue fv = (MCRFieldValue) (fields.get(i));
137                Element field = new Element(fv.getField().getName());
138                field.addContent(fv.getValue());
139                data.addContent(field);
140            }
141    
142            if (LOGGER.isDebugEnabled()) {
143                String s = new XMLOutputter(Format.getPrettyFormat()).outputString(data);
144                LOGGER.debug("----------" + entryID + "----------");
145                LOGGER.debug(s);
146                LOGGER.debug("-----------------------------------");
147            }
148    
149            map.put(entryID, new Document(data));
150        }
151    
152        public void removeFromIndex(String entryID) {
153            LOGGER.info("MCRJDOMSearcher removing indexed data of " + entryID);
154            map.remove(entryID);
155        }
156    
157        public MCRResults search(MCRCondition condition, int maxResults, List sortBy, boolean addSortData) {
158            String xslCondition = buildXSLCondition(condition);
159            LOGGER.debug("MCRJDOMSearcher searching for " + xslCondition);
160    
161            Transformer transformer = buildStylesheet(xslCondition);
162            ByteArrayOutputStream out = new ByteArrayOutputStream();
163    
164            MCRResults results = new MCRResults();
165    
166            for (Iterator keys = map.keySet().iterator(); keys.hasNext();) {
167                String entryID = (String) (keys.next());
168                Document xml = map.get(entryID);
169    
170                if (matches(xml, transformer, out)) {
171                    String returnID = xml.getRootElement().getAttributeValue("returnID");
172                    MCRHit hit = new MCRHit(returnID);
173    
174                    // Add values of all fields that may be sort criteria
175                    for (int i = 0; i < sortBy.size(); i++) {
176                        MCRSortBy by = (MCRSortBy) (sortBy.get(i));
177    
178                        List values = xml.getRootElement().getChildren(by.getField().getName());
179                        for (Iterator itv = values.iterator(); itv.hasNext();) {
180                            Element value = (Element) (itv.next());
181                            MCRFieldDef def = MCRFieldDef.getDef(value.getName());
182                            hit.addSortData(new MCRFieldValue(def, value.getText()));
183                        }
184                    }
185    
186                    results.addHit(hit);
187                }
188    
189                if (sortBy.isEmpty() && (maxResults > 0) && (results.getNumHits() >= maxResults))
190                    break;
191            }
192    
193            LOGGER.debug("MCRJDOMSearcher results completed");
194            return results;
195        }
196    
197        /**
198         * Returns true if the xml input document matches the xsl when condition in
199         * the xsl stylesheet.
200         */
201        private boolean matches(Document xml, Transformer transformer, ByteArrayOutputStream out) {
202            Source xmlsrc = new JDOMSource(xml);
203    
204            try {
205                out.reset();
206                transformer.transform(xmlsrc, new StreamResult(out));
207                out.flush();
208    
209                return "t".equals(out.toString("UTF-8"));
210            } catch (Exception ex) {
211                LOGGER.warn("Exception while testing indexed data with XSL condition", ex);
212    
213                return false;
214            }
215        }
216    
217        /**
218         * XSL stylesheet template where only the when test attribute has to be
219         * added
220         */
221        private Document xslTemplate = null;
222    
223        /** Prepares an XSL stylesheet in memory used as template */
224        private Document prepareStylesheet() {
225            Namespace extns = Namespace.getNamespace("ext", "xalan://org.mycore.backend.jdom.MCRJDOMSearcher");
226    
227            Element stylesheet = new Element("stylesheet");
228            stylesheet.setAttribute("version", "1.0");
229            stylesheet.setNamespace(MCRConstants.XSL_NAMESPACE);
230            stylesheet.addNamespaceDeclaration(MCRFieldDef.xalanns);
231            stylesheet.addNamespaceDeclaration(extns);
232            stylesheet.setAttribute("extension-element-prefixes", "ext");
233    
234            Element output = new Element("output", MCRConstants.XSL_NAMESPACE);
235            output.setAttribute("method", "text");
236            stylesheet.addContent(output);
237    
238            Element template = new Element("template", MCRConstants.XSL_NAMESPACE);
239            template.setAttribute("match", "/data");
240            stylesheet.addContent(template);
241    
242            Element choose = new Element("choose", MCRConstants.XSL_NAMESPACE);
243            template.addContent(choose);
244    
245            Element when = new Element("when", MCRConstants.XSL_NAMESPACE);
246            when.addContent("t");
247    
248            Element otherwise = new Element("otherwise", MCRConstants.XSL_NAMESPACE);
249            otherwise.addContent("f");
250            choose.addContent(when).addContent(otherwise);
251    
252            return new Document(stylesheet);
253        }
254    
255        /** Adds the condition as xsl when test attribute to the stylesheet template */
256        private Transformer buildStylesheet(String condition) {
257            if (xslTemplate == null) {
258                xslTemplate = prepareStylesheet();
259            }
260    
261            Document xsl = (Document) (xslTemplate.clone());
262            xsl.getRootElement().getChild("template", MCRConstants.XSL_NAMESPACE).getChild("choose", MCRConstants.XSL_NAMESPACE).getChild("when", MCRConstants.XSL_NAMESPACE).setAttribute("test", condition);
263            Source xslsrc = new JDOMSource(xsl);
264            Transformer transformer;
265            try {
266                transformer = factory.newTransformer(xslsrc);
267            } catch (TransformerConfigurationException ex) {
268                String msg = "Could not compile XSL stylesheet to be used for searching";
269                throw new MCRException(msg, ex);
270            }
271    
272            return transformer;
273        }
274    
275        /** Converter from MCRCondition to XSL test condition */
276        private String buildXSLCondition(MCRCondition cond) {
277            if (cond instanceof MCRQueryCondition) {
278                MCRQueryCondition sc = (MCRQueryCondition) cond;
279                StringBuffer sb = new StringBuffer(sc.getField().getName());
280                sb.append("[");
281    
282                if ("= < > <= >=".indexOf(sc.getOperator()) >= 0) {
283                    String type = sc.getField().getDataType();
284    
285                    if ("integer".equals(type) || "decimal".equals(type)) {
286                        sb.append("number(text()) ");
287                        sb.append(sc.getOperator());
288                        sb.append(" ");
289                        sb.append(sc.getValue());
290                    } else {
291                        sb.append("ext:compare(text(),'");
292                        sb.append(sc.getValue());
293                        sb.append("','");
294                        sb.append(sc.getOperator());
295                        sb.append("')");
296                    }
297                } else if ("phrase".equals(sc.getOperator())) {
298                    sb.append("contains(text(),'");
299                    sb.append(sc.getValue()).append("')");
300                } else if ("contains".equals(sc.getOperator())) {
301                    sb.append("ext:contains(text(),'");
302                    sb.append(sc.getValue()).append("')");
303                } else if ("like".equals(sc.getOperator())) {
304                    sb.append("ext:like(text(),'");
305                    sb.append(sc.getValue()).append("')");
306                }
307    
308                sb.append("]");
309    
310                return sb.toString();
311            } else if (cond instanceof MCRNotCondition) {
312                MCRNotCondition nc = (MCRNotCondition) cond;
313                return "not(" + buildXSLCondition(nc.getChild()) + ")";
314            } else if (cond instanceof MCRAndCondition) {
315                MCRAndCondition ac = (MCRAndCondition) cond;
316                return buildXSLCondition(ac.getChildren(), "and");
317            } else if (cond instanceof MCROrCondition) {
318                MCROrCondition oc = (MCROrCondition) cond;
319                return buildXSLCondition(oc.getChildren(), "or");
320            } else {
321                return "";
322            }
323        }
324    
325        /** Builds a combined and/or XSL condition */
326        private String buildXSLCondition(List children, String operator) {
327            StringBuffer sb = new StringBuffer();
328            sb.append("(");
329    
330            for (int i = 0; i < children.size(); i++) {
331                MCRCondition sc = (MCRCondition) (children.get(i));
332                sb.append(buildXSLCondition(sc));
333    
334                if (i < (children.size() - 1)) {
335                    sb.append(" ").append(operator).append(" ");
336                }
337            }
338    
339            sb.append(")");
340            return sb.toString();
341        }
342    
343        /** Implements the contains operator as Xalan function extension */
344        public static boolean contains(String value, String words) {
345            if ((value == null) || (value.trim().length() == 0)) {
346                return false;
347            }
348    
349            if ((words == null) || (words.trim().length() == 0)) {
350                return true;
351            }
352    
353            StringTokenizer st = new StringTokenizer(words);
354            while (st.hasMoreTokens())
355    
356                if (value.indexOf(st.nextToken()) == -1) {
357                    return false;
358                }
359    
360            return true;
361        }
362    
363        /** Implements the like operator as Xalan function extension */
364        public static boolean like(String value, String pattern) {
365            if ((value == null) || (value.trim().length() == 0)) {
366                return false;
367            }
368    
369            if ((pattern == null) || (pattern.trim().length() == 0)) {
370                return true;
371            }
372    
373            if (!pattern.endsWith("*"))
374                pattern = pattern + "*";
375            if (!pattern.startsWith("*"))
376                pattern = "*" + pattern;
377    
378            pattern = pattern.replaceAll("\\?", ".");
379            pattern = pattern.replaceAll("\\*", "(.*)");
380    
381            LOGGER.debug("Search regex " + pattern + " in text \"" + value + "\"");
382    
383            return Pattern.matches(pattern, value);
384        }
385    
386        /** Implements a string compare operator as Xalan function extension */
387        public static boolean compare(String valueA, String valueB, String operator) {
388            return MCRInputValidator.instance().compare(valueA, valueB, operator, "string", null);
389        }
390    
391        public void addSortData(Iterator<MCRHit> hits, List<MCRSortBy> sortBy) {
392            while (hits.hasNext()) {
393                MCRHit hit = (MCRHit) hits.next();
394                Document data = map.get(hit.getID());
395    
396                for (int j = 0; j < sortBy.size(); j++) {
397                    MCRFieldDef fd = sortBy.get(j).getField();
398                    List values = data.getRootElement().getChildren(fd.getName());
399                    for (Iterator itv = values.iterator(); itv.hasNext();) {
400                        Element value = (Element) (itv.next());
401                        hit.addSortData(new MCRFieldValue(fd, value.getText()));
402                    }
403                }
404            }
405        }
406    }