001    /*
002     * 
003     * $Revision: 14944 $ $Date: 2009-03-18 12:19:32 +0100 (Wed, 18 Mar 2009) $
004     *
005     * This file is part of ***  M y C o R e  ***
006     * See http://www.mycore.de/ for details.
007     *
008     * This program is free software; you can use it, redistribute it
009     * and / or modify it under the terms of the GNU General Public License
010     * (GPL) as published by the Free Software Foundation; either version 2
011     * of the License or (at your option) any later version.
012     *
013     * This program is distributed in the hope that it will be useful, but
014     * WITHOUT ANY WARRANTY; without even the implied warranty of
015     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016     * GNU General Public License for more details.
017     *
018     * You should have received a copy of the GNU General Public License
019     * along with this program, in a file called gpl.txt or license.txt.
020     * If not, write to the Free Software Foundation Inc.,
021     * 59 Temple Place - Suite 330, Boston, MA  02111-1307 USA
022     */
023    
024    package org.mycore.services.fieldquery;
025    
026    import java.io.ByteArrayInputStream;
027    import java.util.ArrayList;
028    import java.util.List;
029    
030    import javax.xml.transform.Source;
031    import javax.xml.transform.Templates;
032    import javax.xml.transform.Transformer;
033    import javax.xml.transform.TransformerConfigurationException;
034    import javax.xml.transform.TransformerFactory;
035    import javax.xml.transform.sax.SAXTransformerFactory;
036    import javax.xml.transform.stream.StreamSource;
037    
038    import org.apache.log4j.Logger;
039    import org.jdom.Document;
040    import org.jdom.Element;
041    import org.jdom.Namespace;
042    import org.jdom.output.XMLOutputter;
043    import org.jdom.transform.JDOMResult;
044    import org.jdom.transform.JDOMSource;
045    import org.mycore.common.MCRCache;
046    import org.mycore.common.MCRConfigurationException;
047    import org.mycore.common.MCRConstants;
048    import org.mycore.common.MCRException;
049    import org.mycore.common.xml.MCRURIResolver;
050    import org.mycore.datamodel.ifs.MCRFile;
051    import org.mycore.datamodel.metadata.MCRMetaISO8601Date;
052    import org.mycore.datamodel.metadata.MCRObject;
053    
054    /**
055     * Provides methods to automatically extract field values for indexing from
056     * MCRObject, MCRFile or any XML document using the definition in
057     * searchfields.xml. The buildFields method returns a list of MCRFieldValue
058     * objects with values extracted from the object for the given search index.
059     * This class supports extracting values from MCRObject metadata, MCRFile
060     * metadata, MCRFile xml content. MCRFile additional data, MCRFile text content
061     * using the text filter plug-ins, and any plain XML document.
062     * 
063     * @see MCRSearcher#addToIndex(String, List)
064     * @author Frank L�tzenkirchen
065     */
066    public class MCRData2Fields {
067    
068        /** The logger */
069        private static final Logger LOGGER = Logger.getLogger(MCRData2Fields.class);
070    
071        /** The XSL transformer factory to use */
072        private static SAXTransformerFactory factory;
073    
074        /** A template element to be used for building individual stylesheet */
075        private static Element xslTemplate;
076    
077        private static MCRCache stylesheets = new MCRCache(20, "data2searchfields stylesheets");
078    
079        static {
080            TransformerFactory tf = TransformerFactory.newInstance();
081            if (!tf.getFeature(SAXTransformerFactory.FEATURE)) {
082                throw new MCRConfigurationException("Could not load a SAXTransformerFactory for use with XSLT");
083            }
084    
085            factory = (SAXTransformerFactory) (tf);
086            factory.setURIResolver(MCRURIResolver.instance());
087    
088            xslTemplate = new Element("stylesheet");
089            xslTemplate.setAttribute("version", "1.0");
090            xslTemplate.setNamespace(MCRConstants.XSL_NAMESPACE);
091            xslTemplate.addNamespaceDeclaration(Namespace.XML_NAMESPACE);
092            xslTemplate.addNamespaceDeclaration(MCRConstants.XLINK_NAMESPACE);
093            xslTemplate.addNamespaceDeclaration(MCRFieldDef.xalanns);
094            xslTemplate.addNamespaceDeclaration(MCRFieldDef.extns);
095            xslTemplate.setAttribute("extension-element-prefixes", "ext");
096    
097            Element param = new Element("param", MCRConstants.XSL_NAMESPACE);
098            param.setAttribute("name", "objectType");
099            xslTemplate.addContent(param);
100    
101            Element template = new Element("template", MCRConstants.XSL_NAMESPACE);
102            template.setAttribute("match", "/");
103            xslTemplate.addContent(template);
104    
105            Element fieldValues = new Element("fieldValues", MCRConstants.MCR_NAMESPACE);
106            template.addContent(fieldValues);
107        }
108    
109        private static Templates buildStylesheet(String index, String source) {
110            String key = index + "//" + source;
111            Templates stylesheet = (Templates) (stylesheets.get(key));
112    
113            if (stylesheet == null) {
114                Element root = (Element) (xslTemplate.clone());
115                Element fv = root.getChild("template", MCRConstants.XSL_NAMESPACE).getChild("fieldValues", MCRConstants.MCR_NAMESPACE);
116    
117                List<MCRFieldDef> fieldDefs = MCRFieldDef.getFieldDefs(index);
118                for (int i = 0; i < fieldDefs.size(); i++) {
119                    MCRFieldDef fieldDef = fieldDefs.get(i);
120                    if (source.indexOf(fieldDef.getSource()) == -1)
121                        continue;
122                    Element fragment = fieldDef.getXSL();
123                    if (fragment != null)
124                        fv.addContent(fragment);
125                }
126    
127                if (LOGGER.isDebugEnabled()) {
128                    LOGGER.debug("---------- Stylesheet for \"" + index + "\" / " + source + " ----------");
129                    XMLOutputter out = new XMLOutputter(org.jdom.output.Format.getPrettyFormat());
130                    LOGGER.debug("\n" + out.outputString(root));
131                }
132    
133                try {
134                    stylesheet = factory.newTemplates(new JDOMSource(new Document(root)));
135                } catch (TransformerConfigurationException exc) {
136                    String msg = "Error while compiling XSL stylesheet: " + exc.getMessageAndLocation();
137                    throw new MCRConfigurationException(msg, exc);
138                }
139                stylesheets.put(key, stylesheet);
140            }
141    
142            return stylesheet;
143        }
144    
145        /**
146         * Extracts field values for indexing from the given MCRObject's metadata.
147         * 
148         * @param obj
149         *            the MCRObject thats metadata should be indexed
150         * @param index
151         *            the ID of the index as defined in searchfields.xml
152         * @return a List of MCRFieldValue objects that contain field and value
153         */
154        public static List<MCRFieldValue> buildFields(MCRObject obj, String index) {
155            String source = MCRFieldDef.OBJECT_METADATA + " " + MCRFieldDef.OBJECT_CATEGORY;
156            Templates stylesheet = buildStylesheet(index, source);
157            Document xml = obj.createXML();
158            return buildValues(stylesheet, xml, obj.getId().getTypeId());
159        }
160    
161        /**
162         * Extracts field values for indexing from the given MCRFile's metadata, xml
163         * content or text content.
164         * 
165         * @param file
166         *            the MCRFile thats data should be indexed
167         * @param index
168         *            the ID of the index as defined in searchfields.xml
169         * @return a List of MCRFieldValue objects that contain field and value
170         */
171        public static List<MCRFieldValue> buildFields(MCRFile file, String index) {
172            List<MCRFieldValue> values = new ArrayList<MCRFieldValue>();
173    
174            boolean foundSourceXMLContent = false;
175            boolean foundSourceFileMetadata = false;
176            boolean foundSourceFileAdditional = false;
177    
178            // Handle source FILE_TEXT_CONTENT
179            LOGGER.debug("Handle source FILE_TEXT_CONTENT");
180            List<MCRFieldDef> fieldDefList = MCRFieldDef.getFieldDefs(index);
181            for (MCRFieldDef fieldDef : fieldDefList) {
182                if (!fieldDef.isUsedForObjectType(file.getContentTypeID()))
183                    continue;
184    
185                if (MCRFieldDef.FILE_TEXT_CONTENT.equals(fieldDef.getSource()))
186                    values.add(new MCRFieldValue(fieldDef, file));
187    
188                if (MCRFieldDef.FILE_XML_CONTENT.equals(fieldDef.getSource()))
189                    foundSourceXMLContent = true;
190                if (MCRFieldDef.FILE_METADATA.equals(fieldDef.getSource()))
191                    foundSourceFileMetadata = true;
192                if (MCRFieldDef.FILE_ADDITIONAL_DATA.equals(fieldDef.getSource()))
193                    foundSourceFileAdditional = true;
194            }
195    
196            // Handle source FILE_XML_CONTENT
197            if (foundSourceXMLContent) {
198                LOGGER.debug("Handle source FILE_XML_CONTENT");
199                Templates stylesheet = buildStylesheet(index, MCRFieldDef.FILE_XML_CONTENT);
200                Document xml = null;
201                try {
202                    xml = file.getContentAsJDOM();
203                } catch (Exception ex) {
204                    String msg = "Exception while building XML content of MCRFile " + file.getOwnerID() + " " + file.getAbsolutePath();
205                    LOGGER.error(msg, ex);
206                }
207                if (xml != null)
208                    values.addAll(buildValues(stylesheet, xml, file.getContentTypeID()));
209            }
210    
211            // Handle source FILE_METADATA
212            if (foundSourceFileMetadata) {
213                LOGGER.debug("Handle source FILE_METADATA");
214                Templates stylesheet = buildStylesheet(index, MCRFieldDef.FILE_METADATA);
215                Document xml = file.createXML();
216                values.addAll(buildValues(stylesheet, xml, file.getContentTypeID()));
217            }
218    
219            // Handle source FILE_ADDITIONAL_DATA
220            if (foundSourceFileAdditional) {
221                LOGGER.debug("Handle source FILE_ADDITIONAL_DATA");
222                Templates stylesheet = buildStylesheet(index, MCRFieldDef.FILE_ADDITIONAL_DATA);
223                Document xml = null;
224                try {
225                    xml = file.getAllAdditionalData();
226                } catch (Exception ex) {
227                    String msg = "Exception while reading additional XML data of MCRFile " + file.getOwnerID() + " " + file.getAbsolutePath();
228                    LOGGER.error(msg, ex);
229                }
230                if (xml != null)
231                    values.addAll(buildValues(stylesheet, xml, file.getContentTypeID()));
232            }
233    
234            return values;
235        }
236    
237        /**
238         * Extracts field values for indexing from the given JDOM xml document.
239         * 
240         * @param doc
241         *            the JDOM xml document thats data should be indexed
242         * @param index
243         *            the ID of the index as defined in searchfields.xml
244         * @return a List of MCRFieldValue objects that contain name, type and value
245         */
246        public static List<MCRFieldValue> buildFields(Document doc, String index) {
247            Templates stylesheet = buildStylesheet(index, MCRFieldDef.XML);
248            return buildValues(stylesheet, doc, doc.getRootElement().getName());
249        }
250    
251        /**
252         * Extracts field values for indexing from the given JDOM xml document.
253         * 
254         * @param xml
255         *            the xml document thats data should be indexed as byte array
256         * @param index
257         *            the ID of the index as defined in searchfields.xml
258         * @return a List of MCRFieldValue objects that contain name, type and value
259         */
260        public static List<MCRFieldValue> buildFields(byte[] xml, String index, String source, String objectType) {
261            Templates stylesheet = buildStylesheet(index, source);
262            return buildValues(stylesheet, xml, objectType);
263        }
264    
265        /** Transforms xml input to search field values using XSL * */
266        private static List<MCRFieldValue> buildValues(Templates stylesheet, Document xml, String objectType) {
267            return buildValues(stylesheet, new JDOMSource(xml), objectType);
268        }
269    
270        /** Transforms xml input to search field values using XSL * */
271        private static List<MCRFieldValue> buildValues(Templates stylesheet, byte[] xml, String objectType) {
272            return buildValues(stylesheet, new StreamSource(new ByteArrayInputStream(xml)), objectType);
273        }
274    
275        /** Transforms xml input to search field values using XSL * */
276        private static List<MCRFieldValue> buildValues(Templates stylesheet, Source xml, String objectType) {
277            List<MCRFieldValue> values = new ArrayList<MCRFieldValue>();
278    
279            List fieldValues = null;
280            try {
281                JDOMResult xmlres = new JDOMResult();
282                Transformer transformer = factory.newTransformerHandler(stylesheet).getTransformer();
283                transformer.setParameter("objectType", objectType);
284                transformer.transform(xml, xmlres);
285    
286                List resultList = xmlres.getResult();
287                Element root = (Element) (resultList.get(0));
288                fieldValues = root.getChildren();
289            } catch (Exception ex) {
290                String msg = "Exception while transforming metadata to search field";
291                throw new MCRException(msg, ex);
292            }
293    
294            if (fieldValues != null)
295                for (int i = 0; i < fieldValues.size(); i++) {
296                    Element fieldValue = (Element) (fieldValues.get(i));
297                    String value = fieldValue.getTextTrim();
298                    String name = fieldValue.getName();
299                    MCRFieldDef def = MCRFieldDef.getDef(name);
300    
301                    if ((value != null) && (value.length() > 0)) {
302                        LOGGER.debug("MCRData2Fields " + name + " := " + value);
303                        values.add(new MCRFieldValue(def, value));
304                    }
305                }
306            return values;
307        }
308    
309        /**
310         * Xalan XSL extension to convert MyCoRe date values to standard format. To
311         * be used in a stylesheet or searchfields.xml configuration. Usage example:
312         * &lt;field name="date" type="date"
313         * xpath="/mycoreobject/metadata/dates/date"
314         * value="ext:normalizeDate(string(text()))" &gt;
315         * 
316         * @param date
317         *            the date string in a locale-dependent format
318         */
319        public static String normalizeDate(String sDate) {
320            try {
321                MCRMetaISO8601Date iDate = new MCRMetaISO8601Date();
322                iDate.setDate(sDate.trim());
323                return iDate.getISOString().substring(0, 10);
324            } catch (Exception ex) {
325                LOGGER.debug(ex);
326                return "";
327            }
328        }
329    }