001 /*
002 *
003 * $Revision: 14944 $ $Date: 2009-03-18 12:19:32 +0100 (Wed, 18 Mar 2009) $
004 *
005 * This file is part of *** M y C o R e ***
006 * See http://www.mycore.de/ for details.
007 *
008 * This program is free software; you can use it, redistribute it
009 * and / or modify it under the terms of the GNU General Public License
010 * (GPL) as published by the Free Software Foundation; either version 2
011 * of the License or (at your option) any later version.
012 *
013 * This program is distributed in the hope that it will be useful, but
014 * WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
016 * GNU General Public License for more details.
017 *
018 * You should have received a copy of the GNU General Public License
019 * along with this program, in a file called gpl.txt or license.txt.
020 * If not, write to the Free Software Foundation Inc.,
021 * 59 Temple Place - Suite 330, Boston, MA 02111-1307 USA
022 */
023
024 package org.mycore.services.fieldquery;
025
026 import java.io.ByteArrayInputStream;
027 import java.util.ArrayList;
028 import java.util.List;
029
030 import javax.xml.transform.Source;
031 import javax.xml.transform.Templates;
032 import javax.xml.transform.Transformer;
033 import javax.xml.transform.TransformerConfigurationException;
034 import javax.xml.transform.TransformerFactory;
035 import javax.xml.transform.sax.SAXTransformerFactory;
036 import javax.xml.transform.stream.StreamSource;
037
038 import org.apache.log4j.Logger;
039 import org.jdom.Document;
040 import org.jdom.Element;
041 import org.jdom.Namespace;
042 import org.jdom.output.XMLOutputter;
043 import org.jdom.transform.JDOMResult;
044 import org.jdom.transform.JDOMSource;
045 import org.mycore.common.MCRCache;
046 import org.mycore.common.MCRConfigurationException;
047 import org.mycore.common.MCRConstants;
048 import org.mycore.common.MCRException;
049 import org.mycore.common.xml.MCRURIResolver;
050 import org.mycore.datamodel.ifs.MCRFile;
051 import org.mycore.datamodel.metadata.MCRMetaISO8601Date;
052 import org.mycore.datamodel.metadata.MCRObject;
053
054 /**
055 * Provides methods to automatically extract field values for indexing from
056 * MCRObject, MCRFile or any XML document using the definition in
057 * searchfields.xml. The buildFields method returns a list of MCRFieldValue
058 * objects with values extracted from the object for the given search index.
059 * This class supports extracting values from MCRObject metadata, MCRFile
060 * metadata, MCRFile xml content. MCRFile additional data, MCRFile text content
061 * using the text filter plug-ins, and any plain XML document.
062 *
063 * @see MCRSearcher#addToIndex(String, List)
064 * @author Frank L�tzenkirchen
065 */
066 public class MCRData2Fields {
067
068 /** The logger */
069 private static final Logger LOGGER = Logger.getLogger(MCRData2Fields.class);
070
071 /** The XSL transformer factory to use */
072 private static SAXTransformerFactory factory;
073
074 /** A template element to be used for building individual stylesheet */
075 private static Element xslTemplate;
076
077 private static MCRCache stylesheets = new MCRCache(20, "data2searchfields stylesheets");
078
079 static {
080 TransformerFactory tf = TransformerFactory.newInstance();
081 if (!tf.getFeature(SAXTransformerFactory.FEATURE)) {
082 throw new MCRConfigurationException("Could not load a SAXTransformerFactory for use with XSLT");
083 }
084
085 factory = (SAXTransformerFactory) (tf);
086 factory.setURIResolver(MCRURIResolver.instance());
087
088 xslTemplate = new Element("stylesheet");
089 xslTemplate.setAttribute("version", "1.0");
090 xslTemplate.setNamespace(MCRConstants.XSL_NAMESPACE);
091 xslTemplate.addNamespaceDeclaration(Namespace.XML_NAMESPACE);
092 xslTemplate.addNamespaceDeclaration(MCRConstants.XLINK_NAMESPACE);
093 xslTemplate.addNamespaceDeclaration(MCRFieldDef.xalanns);
094 xslTemplate.addNamespaceDeclaration(MCRFieldDef.extns);
095 xslTemplate.setAttribute("extension-element-prefixes", "ext");
096
097 Element param = new Element("param", MCRConstants.XSL_NAMESPACE);
098 param.setAttribute("name", "objectType");
099 xslTemplate.addContent(param);
100
101 Element template = new Element("template", MCRConstants.XSL_NAMESPACE);
102 template.setAttribute("match", "/");
103 xslTemplate.addContent(template);
104
105 Element fieldValues = new Element("fieldValues", MCRConstants.MCR_NAMESPACE);
106 template.addContent(fieldValues);
107 }
108
109 private static Templates buildStylesheet(String index, String source) {
110 String key = index + "//" + source;
111 Templates stylesheet = (Templates) (stylesheets.get(key));
112
113 if (stylesheet == null) {
114 Element root = (Element) (xslTemplate.clone());
115 Element fv = root.getChild("template", MCRConstants.XSL_NAMESPACE).getChild("fieldValues", MCRConstants.MCR_NAMESPACE);
116
117 List<MCRFieldDef> fieldDefs = MCRFieldDef.getFieldDefs(index);
118 for (int i = 0; i < fieldDefs.size(); i++) {
119 MCRFieldDef fieldDef = fieldDefs.get(i);
120 if (source.indexOf(fieldDef.getSource()) == -1)
121 continue;
122 Element fragment = fieldDef.getXSL();
123 if (fragment != null)
124 fv.addContent(fragment);
125 }
126
127 if (LOGGER.isDebugEnabled()) {
128 LOGGER.debug("---------- Stylesheet for \"" + index + "\" / " + source + " ----------");
129 XMLOutputter out = new XMLOutputter(org.jdom.output.Format.getPrettyFormat());
130 LOGGER.debug("\n" + out.outputString(root));
131 }
132
133 try {
134 stylesheet = factory.newTemplates(new JDOMSource(new Document(root)));
135 } catch (TransformerConfigurationException exc) {
136 String msg = "Error while compiling XSL stylesheet: " + exc.getMessageAndLocation();
137 throw new MCRConfigurationException(msg, exc);
138 }
139 stylesheets.put(key, stylesheet);
140 }
141
142 return stylesheet;
143 }
144
145 /**
146 * Extracts field values for indexing from the given MCRObject's metadata.
147 *
148 * @param obj
149 * the MCRObject thats metadata should be indexed
150 * @param index
151 * the ID of the index as defined in searchfields.xml
152 * @return a List of MCRFieldValue objects that contain field and value
153 */
154 public static List<MCRFieldValue> buildFields(MCRObject obj, String index) {
155 String source = MCRFieldDef.OBJECT_METADATA + " " + MCRFieldDef.OBJECT_CATEGORY;
156 Templates stylesheet = buildStylesheet(index, source);
157 Document xml = obj.createXML();
158 return buildValues(stylesheet, xml, obj.getId().getTypeId());
159 }
160
161 /**
162 * Extracts field values for indexing from the given MCRFile's metadata, xml
163 * content or text content.
164 *
165 * @param file
166 * the MCRFile thats data should be indexed
167 * @param index
168 * the ID of the index as defined in searchfields.xml
169 * @return a List of MCRFieldValue objects that contain field and value
170 */
171 public static List<MCRFieldValue> buildFields(MCRFile file, String index) {
172 List<MCRFieldValue> values = new ArrayList<MCRFieldValue>();
173
174 boolean foundSourceXMLContent = false;
175 boolean foundSourceFileMetadata = false;
176 boolean foundSourceFileAdditional = false;
177
178 // Handle source FILE_TEXT_CONTENT
179 LOGGER.debug("Handle source FILE_TEXT_CONTENT");
180 List<MCRFieldDef> fieldDefList = MCRFieldDef.getFieldDefs(index);
181 for (MCRFieldDef fieldDef : fieldDefList) {
182 if (!fieldDef.isUsedForObjectType(file.getContentTypeID()))
183 continue;
184
185 if (MCRFieldDef.FILE_TEXT_CONTENT.equals(fieldDef.getSource()))
186 values.add(new MCRFieldValue(fieldDef, file));
187
188 if (MCRFieldDef.FILE_XML_CONTENT.equals(fieldDef.getSource()))
189 foundSourceXMLContent = true;
190 if (MCRFieldDef.FILE_METADATA.equals(fieldDef.getSource()))
191 foundSourceFileMetadata = true;
192 if (MCRFieldDef.FILE_ADDITIONAL_DATA.equals(fieldDef.getSource()))
193 foundSourceFileAdditional = true;
194 }
195
196 // Handle source FILE_XML_CONTENT
197 if (foundSourceXMLContent) {
198 LOGGER.debug("Handle source FILE_XML_CONTENT");
199 Templates stylesheet = buildStylesheet(index, MCRFieldDef.FILE_XML_CONTENT);
200 Document xml = null;
201 try {
202 xml = file.getContentAsJDOM();
203 } catch (Exception ex) {
204 String msg = "Exception while building XML content of MCRFile " + file.getOwnerID() + " " + file.getAbsolutePath();
205 LOGGER.error(msg, ex);
206 }
207 if (xml != null)
208 values.addAll(buildValues(stylesheet, xml, file.getContentTypeID()));
209 }
210
211 // Handle source FILE_METADATA
212 if (foundSourceFileMetadata) {
213 LOGGER.debug("Handle source FILE_METADATA");
214 Templates stylesheet = buildStylesheet(index, MCRFieldDef.FILE_METADATA);
215 Document xml = file.createXML();
216 values.addAll(buildValues(stylesheet, xml, file.getContentTypeID()));
217 }
218
219 // Handle source FILE_ADDITIONAL_DATA
220 if (foundSourceFileAdditional) {
221 LOGGER.debug("Handle source FILE_ADDITIONAL_DATA");
222 Templates stylesheet = buildStylesheet(index, MCRFieldDef.FILE_ADDITIONAL_DATA);
223 Document xml = null;
224 try {
225 xml = file.getAllAdditionalData();
226 } catch (Exception ex) {
227 String msg = "Exception while reading additional XML data of MCRFile " + file.getOwnerID() + " " + file.getAbsolutePath();
228 LOGGER.error(msg, ex);
229 }
230 if (xml != null)
231 values.addAll(buildValues(stylesheet, xml, file.getContentTypeID()));
232 }
233
234 return values;
235 }
236
237 /**
238 * Extracts field values for indexing from the given JDOM xml document.
239 *
240 * @param doc
241 * the JDOM xml document thats data should be indexed
242 * @param index
243 * the ID of the index as defined in searchfields.xml
244 * @return a List of MCRFieldValue objects that contain name, type and value
245 */
246 public static List<MCRFieldValue> buildFields(Document doc, String index) {
247 Templates stylesheet = buildStylesheet(index, MCRFieldDef.XML);
248 return buildValues(stylesheet, doc, doc.getRootElement().getName());
249 }
250
251 /**
252 * Extracts field values for indexing from the given JDOM xml document.
253 *
254 * @param xml
255 * the xml document thats data should be indexed as byte array
256 * @param index
257 * the ID of the index as defined in searchfields.xml
258 * @return a List of MCRFieldValue objects that contain name, type and value
259 */
260 public static List<MCRFieldValue> buildFields(byte[] xml, String index, String source, String objectType) {
261 Templates stylesheet = buildStylesheet(index, source);
262 return buildValues(stylesheet, xml, objectType);
263 }
264
265 /** Transforms xml input to search field values using XSL * */
266 private static List<MCRFieldValue> buildValues(Templates stylesheet, Document xml, String objectType) {
267 return buildValues(stylesheet, new JDOMSource(xml), objectType);
268 }
269
270 /** Transforms xml input to search field values using XSL * */
271 private static List<MCRFieldValue> buildValues(Templates stylesheet, byte[] xml, String objectType) {
272 return buildValues(stylesheet, new StreamSource(new ByteArrayInputStream(xml)), objectType);
273 }
274
275 /** Transforms xml input to search field values using XSL * */
276 private static List<MCRFieldValue> buildValues(Templates stylesheet, Source xml, String objectType) {
277 List<MCRFieldValue> values = new ArrayList<MCRFieldValue>();
278
279 List fieldValues = null;
280 try {
281 JDOMResult xmlres = new JDOMResult();
282 Transformer transformer = factory.newTransformerHandler(stylesheet).getTransformer();
283 transformer.setParameter("objectType", objectType);
284 transformer.transform(xml, xmlres);
285
286 List resultList = xmlres.getResult();
287 Element root = (Element) (resultList.get(0));
288 fieldValues = root.getChildren();
289 } catch (Exception ex) {
290 String msg = "Exception while transforming metadata to search field";
291 throw new MCRException(msg, ex);
292 }
293
294 if (fieldValues != null)
295 for (int i = 0; i < fieldValues.size(); i++) {
296 Element fieldValue = (Element) (fieldValues.get(i));
297 String value = fieldValue.getTextTrim();
298 String name = fieldValue.getName();
299 MCRFieldDef def = MCRFieldDef.getDef(name);
300
301 if ((value != null) && (value.length() > 0)) {
302 LOGGER.debug("MCRData2Fields " + name + " := " + value);
303 values.add(new MCRFieldValue(def, value));
304 }
305 }
306 return values;
307 }
308
309 /**
310 * Xalan XSL extension to convert MyCoRe date values to standard format. To
311 * be used in a stylesheet or searchfields.xml configuration. Usage example:
312 * <field name="date" type="date"
313 * xpath="/mycoreobject/metadata/dates/date"
314 * value="ext:normalizeDate(string(text()))" >
315 *
316 * @param date
317 * the date string in a locale-dependent format
318 */
319 public static String normalizeDate(String sDate) {
320 try {
321 MCRMetaISO8601Date iDate = new MCRMetaISO8601Date();
322 iDate.setDate(sDate.trim());
323 return iDate.getISOString().substring(0, 10);
324 } catch (Exception ex) {
325 LOGGER.debug(ex);
326 return "";
327 }
328 }
329 }