001    /*
002     * 
003     * $Revision: 13085 $ $Date: 2008-02-06 18:27:24 +0100 (Mi, 06 Feb 2008) $
004     *
005     * This file is part of ***  M y C o R e  ***
006     * See http://www.mycore.de/ for details.
007     *
008     * This program is free software; you can use it, redistribute it
009     * and / or modify it under the terms of the GNU General Public License
010     * (GPL) as published by the Free Software Foundation; either version 2
011     * of the License or (at your option) any later version.
012     *
013     * This program is distributed in the hope that it will be useful, but
014     * WITHOUT ANY WARRANTY; without even the implied warranty of
015     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016     * GNU General Public License for more details.
017     *
018     * You should have received a copy of the GNU General Public License
019     * along with this program, in a file called gpl.txt or license.txt.
020     * If not, write to the Free Software Foundation Inc.,
021     * 59 Temple Place - Suite 330, Boston, MA  02111-1307 USA
022     */
023    
024    package org.mycore.services.plugins;
025    
026    import java.io.ByteArrayInputStream;
027    import java.io.ByteArrayOutputStream;
028    import java.io.InputStream;
029    import java.io.Reader;
030    import java.io.StringReader;
031    import java.util.HashSet;
032    import java.util.List;
033    
034    import org.apache.log4j.Logger;
035    import org.jdom.Comment;
036    import org.jdom.Element;
037    import org.jdom.Text;
038    import org.jdom.input.SAXBuilder;
039    import org.jdom.output.XMLOutputter;
040    import org.mycore.common.MCRException;
041    import org.mycore.datamodel.ifs.MCRFileContentType;
042    import org.mycore.datamodel.ifs.MCRFileContentTypeFactory;
043    import org.w3c.tidy.Tidy;
044    
045    /**
046     * Converts XML, XTHML and HTML to plain text for indexing
047     * 
048     * @author Frank Lützenkirchen
049     * @author Harald Richter
050     */
051    public class XmlHtmlPlugin implements TextFilterPlugin {
052        /** The logger */
053        private static final Logger LOGGER = Logger.getLogger(XmlHtmlPlugin.class);
054    
055        private static final int MAJOR = 1;
056    
057        private static final int MINOR = 0;
058    
059        private static HashSet contentTypes;
060    
061        private static String info = null;
062    
063        public XmlHtmlPlugin() {
064            super();
065    
066            if (contentTypes == null) {
067                contentTypes = new HashSet();
068    
069                if (MCRFileContentTypeFactory.isTypeAvailable("xml")) {
070                    contentTypes.add(MCRFileContentTypeFactory.getType("xml"));
071                }
072    
073                if (MCRFileContentTypeFactory.isTypeAvailable("html")) {
074                    contentTypes.add(MCRFileContentTypeFactory.getType("html"));
075                }
076            }
077    
078            if (info == null) {
079                info = new StringBuffer("This filter converts XML, XTHML and HTML to plain text").toString();
080            }
081        }
082    
083        /**
084         * @see org.mycore.services.plugins.TextFilterPlugin#getName()
085         */
086        public String getName() {
087            return "hfwri's and fluetze's amazing xml and html Filter";
088        }
089    
090        /**
091         * @see org.mycore.services.plugins.TextFilterPlugin#getInfo()
092         */
093        public String getInfo() {
094            return info;
095        }
096    
097        /**
098         * @see org.mycore.services.plugins.XmlHtmlPlugin#getSupportedContentTypes()
099         */
100        public HashSet getSupportedContentTypes() {
101            return contentTypes;
102        }
103    
104        /**
105         * @see org.mycore.services.plugins.TextFilterPlugin#transform(org.mycore.datamodel.ifs.MCRFileContentType,org.mycore.datamodel.ifs.MCRContentInputStream,
106         *      java.io.OutputStream)
107         */
108        public Reader transform(MCRFileContentType ct, InputStream input) throws FilterPluginTransformException {
109            if (getSupportedContentTypes().contains(ct)) {
110                String tx = getFullText(ct, input);
111    
112                return new StringReader(tx);
113            }
114            throw new FilterPluginTransformException("ContentType " + ct + " is not supported by " + getName() + "!");
115        }
116    
117        /**
118         * @see org.mycore.services.plugins.TextFilterPlugin#getMajorNumber()
119         */
120        public int getMajorNumber() {
121            return MAJOR;
122        }
123    
124        /**
125         * @see org.mycore.services.plugins.TextFilterPlugin#getMinorNumber()
126         */
127        public int getMinorNumber() {
128            return MINOR;
129        }
130    
131        private static String getFullText(MCRFileContentType ct, InputStream input) {
132            try {
133                if (ct.getID().equals("xml")) {
134                    org.jdom.input.SAXBuilder builder = new org.jdom.input.SAXBuilder();
135    
136                    return getText(builder.build(input)); // file.getContentAsJDOM()
137                } else if (ct.getID().equals("html")) {
138                    org.jdom.Document xml = tidy(input);
139                    return (xml == null ? "" : getText(xml));
140                } else {
141                    return null;
142                }
143            } catch (Exception ex) {
144                ex.printStackTrace();
145    
146                return null;
147            }
148        }
149    
150        /** Converts HTML string to XML to be able to extract text nodes * */
151        public static String getFullText(String html) {
152            org.jdom.Document xml = tidy(new ByteArrayInputStream(html.getBytes()));
153            if (xml == null)
154                return null;
155            else
156                return getText(xml);
157        }
158    
159        /** Converts HTML files to XML to be able to extract text nodes * */
160        private static org.jdom.Document tidy(InputStream input) {
161            Tidy tidy = new Tidy();
162            tidy.setForceOutput(true);
163            tidy.setFixComments(true);
164            tidy.setHideEndTags(false);
165            tidy.setQuiet(!LOGGER.isDebugEnabled());
166            tidy.setShowWarnings(LOGGER.isDebugEnabled());
167            tidy.setXmlOut(true);
168            tidy.setXmlTags(false);
169            tidy.setPrintBodyOnly(true);
170            tidy.setNumEntities(true);
171    
172            try {
173                ByteArrayOutputStream baos = new ByteArrayOutputStream();
174                baos.write("<html><body>".getBytes());
175                tidy.parseDOM(input, baos);
176                baos.write("</body></html>".getBytes());
177                baos.close();
178                byte[] bytes = baos.toByteArray();
179                LOGGER.debug("------ after JTidy: ------");
180                LOGGER.debug(new String(bytes, tidy.getOutputEncoding()));
181                ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
182                SAXBuilder builder = new SAXBuilder();
183                builder.setExpandEntities(false);
184                builder.setValidation(false);
185                org.jdom.Document jdoc = builder.build(bais);
186                return jdoc;
187            } catch (Exception ex) {
188                LOGGER.info("Exception while tidying HTML to XML: " + ex.getClass().getName() + ": " + ex.getMessage());
189                LOGGER.debug(MCRException.getStackTraceAsString(ex));
190                return null;
191            }
192        }
193    
194        /** Extracts text of text nodes and comment nodes from xml files * */
195        private static String getText(org.jdom.Document xml) {
196            StringBuffer buffer = new StringBuffer();
197            xml2txt(buffer, xml.getContent());
198            LOGGER.debug("------ after xml2txt ------" );
199            LOGGER.debug(buffer.toString());
200            return buffer.toString();
201        }
202    
203        /** Extracts text of text nodes and comment nodes from xml files * */
204        private static void xml2txt(StringBuffer buffer, List content) {
205            for (int i = 0; (content != null) && (i < content.size()); i++) {
206                Object obj = content.get(i);
207    
208                if (obj instanceof Element) {
209                    Element elem = (Element) obj;
210                    xml2txt(buffer, elem.getContent());
211                } else if (obj instanceof Text) {
212                    Text text = (Text) obj;
213                    buffer.append(text.getTextTrim()).append("\n\n");
214                } else if (obj instanceof Comment) {
215                    Comment comm = (Comment) obj;
216                    buffer.append(comm.getText()).append("\n\n");
217                }
218            }
219        }
220    }