001 /*
002 *
003 * $Revision: 13085 $ $Date: 2008-02-06 18:27:24 +0100 (Mi, 06 Feb 2008) $
004 *
005 * This file is part of *** M y C o R e ***
006 * See http://www.mycore.de/ for details.
007 *
008 * This program is free software; you can use it, redistribute it
009 * and / or modify it under the terms of the GNU General Public License
010 * (GPL) as published by the Free Software Foundation; either version 2
011 * of the License or (at your option) any later version.
012 *
013 * This program is distributed in the hope that it will be useful, but
014 * WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
016 * GNU General Public License for more details.
017 *
018 * You should have received a copy of the GNU General Public License
019 * along with this program, in a file called gpl.txt or license.txt.
020 * If not, write to the Free Software Foundation Inc.,
021 * 59 Temple Place - Suite 330, Boston, MA 02111-1307 USA
022 */
023
024 package org.mycore.services.plugins;
025
026 import java.io.ByteArrayInputStream;
027 import java.io.ByteArrayOutputStream;
028 import java.io.InputStream;
029 import java.io.Reader;
030 import java.io.StringReader;
031 import java.util.HashSet;
032 import java.util.List;
033
034 import org.apache.log4j.Logger;
035 import org.jdom.Comment;
036 import org.jdom.Element;
037 import org.jdom.Text;
038 import org.jdom.input.SAXBuilder;
039 import org.jdom.output.XMLOutputter;
040 import org.mycore.common.MCRException;
041 import org.mycore.datamodel.ifs.MCRFileContentType;
042 import org.mycore.datamodel.ifs.MCRFileContentTypeFactory;
043 import org.w3c.tidy.Tidy;
044
045 /**
046 * Converts XML, XTHML and HTML to plain text for indexing
047 *
048 * @author Frank Lützenkirchen
049 * @author Harald Richter
050 */
051 public class XmlHtmlPlugin implements TextFilterPlugin {
052 /** The logger */
053 private static final Logger LOGGER = Logger.getLogger(XmlHtmlPlugin.class);
054
055 private static final int MAJOR = 1;
056
057 private static final int MINOR = 0;
058
059 private static HashSet contentTypes;
060
061 private static String info = null;
062
063 public XmlHtmlPlugin() {
064 super();
065
066 if (contentTypes == null) {
067 contentTypes = new HashSet();
068
069 if (MCRFileContentTypeFactory.isTypeAvailable("xml")) {
070 contentTypes.add(MCRFileContentTypeFactory.getType("xml"));
071 }
072
073 if (MCRFileContentTypeFactory.isTypeAvailable("html")) {
074 contentTypes.add(MCRFileContentTypeFactory.getType("html"));
075 }
076 }
077
078 if (info == null) {
079 info = new StringBuffer("This filter converts XML, XTHML and HTML to plain text").toString();
080 }
081 }
082
083 /**
084 * @see org.mycore.services.plugins.TextFilterPlugin#getName()
085 */
086 public String getName() {
087 return "hfwri's and fluetze's amazing xml and html Filter";
088 }
089
090 /**
091 * @see org.mycore.services.plugins.TextFilterPlugin#getInfo()
092 */
093 public String getInfo() {
094 return info;
095 }
096
097 /**
098 * @see org.mycore.services.plugins.XmlHtmlPlugin#getSupportedContentTypes()
099 */
100 public HashSet getSupportedContentTypes() {
101 return contentTypes;
102 }
103
104 /**
105 * @see org.mycore.services.plugins.TextFilterPlugin#transform(org.mycore.datamodel.ifs.MCRFileContentType,org.mycore.datamodel.ifs.MCRContentInputStream,
106 * java.io.OutputStream)
107 */
108 public Reader transform(MCRFileContentType ct, InputStream input) throws FilterPluginTransformException {
109 if (getSupportedContentTypes().contains(ct)) {
110 String tx = getFullText(ct, input);
111
112 return new StringReader(tx);
113 }
114 throw new FilterPluginTransformException("ContentType " + ct + " is not supported by " + getName() + "!");
115 }
116
117 /**
118 * @see org.mycore.services.plugins.TextFilterPlugin#getMajorNumber()
119 */
120 public int getMajorNumber() {
121 return MAJOR;
122 }
123
124 /**
125 * @see org.mycore.services.plugins.TextFilterPlugin#getMinorNumber()
126 */
127 public int getMinorNumber() {
128 return MINOR;
129 }
130
131 private static String getFullText(MCRFileContentType ct, InputStream input) {
132 try {
133 if (ct.getID().equals("xml")) {
134 org.jdom.input.SAXBuilder builder = new org.jdom.input.SAXBuilder();
135
136 return getText(builder.build(input)); // file.getContentAsJDOM()
137 } else if (ct.getID().equals("html")) {
138 org.jdom.Document xml = tidy(input);
139 return (xml == null ? "" : getText(xml));
140 } else {
141 return null;
142 }
143 } catch (Exception ex) {
144 ex.printStackTrace();
145
146 return null;
147 }
148 }
149
150 /** Converts HTML string to XML to be able to extract text nodes * */
151 public static String getFullText(String html) {
152 org.jdom.Document xml = tidy(new ByteArrayInputStream(html.getBytes()));
153 if (xml == null)
154 return null;
155 else
156 return getText(xml);
157 }
158
159 /** Converts HTML files to XML to be able to extract text nodes * */
160 private static org.jdom.Document tidy(InputStream input) {
161 Tidy tidy = new Tidy();
162 tidy.setForceOutput(true);
163 tidy.setFixComments(true);
164 tidy.setHideEndTags(false);
165 tidy.setQuiet(!LOGGER.isDebugEnabled());
166 tidy.setShowWarnings(LOGGER.isDebugEnabled());
167 tidy.setXmlOut(true);
168 tidy.setXmlTags(false);
169 tidy.setPrintBodyOnly(true);
170 tidy.setNumEntities(true);
171
172 try {
173 ByteArrayOutputStream baos = new ByteArrayOutputStream();
174 baos.write("<html><body>".getBytes());
175 tidy.parseDOM(input, baos);
176 baos.write("</body></html>".getBytes());
177 baos.close();
178 byte[] bytes = baos.toByteArray();
179 LOGGER.debug("------ after JTidy: ------");
180 LOGGER.debug(new String(bytes, tidy.getOutputEncoding()));
181 ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
182 SAXBuilder builder = new SAXBuilder();
183 builder.setExpandEntities(false);
184 builder.setValidation(false);
185 org.jdom.Document jdoc = builder.build(bais);
186 return jdoc;
187 } catch (Exception ex) {
188 LOGGER.info("Exception while tidying HTML to XML: " + ex.getClass().getName() + ": " + ex.getMessage());
189 LOGGER.debug(MCRException.getStackTraceAsString(ex));
190 return null;
191 }
192 }
193
194 /** Extracts text of text nodes and comment nodes from xml files * */
195 private static String getText(org.jdom.Document xml) {
196 StringBuffer buffer = new StringBuffer();
197 xml2txt(buffer, xml.getContent());
198 LOGGER.debug("------ after xml2txt ------" );
199 LOGGER.debug(buffer.toString());
200 return buffer.toString();
201 }
202
203 /** Extracts text of text nodes and comment nodes from xml files * */
204 private static void xml2txt(StringBuffer buffer, List content) {
205 for (int i = 0; (content != null) && (i < content.size()); i++) {
206 Object obj = content.get(i);
207
208 if (obj instanceof Element) {
209 Element elem = (Element) obj;
210 xml2txt(buffer, elem.getContent());
211 } else if (obj instanceof Text) {
212 Text text = (Text) obj;
213 buffer.append(text.getTextTrim()).append("\n\n");
214 } else if (obj instanceof Comment) {
215 Comment comm = (Comment) obj;
216 buffer.append(comm.getText()).append("\n\n");
217 }
218 }
219 }
220 }