001    /*
002     * 
003     * $Revision: 13085 $ $Date: 2008-02-06 18:27:24 +0100 (Mi, 06 Feb 2008) $
004     *
005     * This file is part of ***  M y C o R e  ***
006     * See http://www.mycore.de/ for details.
007     *
008     * This program is free software; you can use it, redistribute it
009     * and / or modify it under the terms of the GNU General Public License
010     * (GPL) as published by the Free Software Foundation; either version 2
011     * of the License or (at your option) any later version.
012     *
013     * This program is distributed in the hope that it will be useful, but
014     * WITHOUT ANY WARRANTY; without even the implied warranty of
015     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016     * GNU General Public License for more details.
017     *
018     * You should have received a copy of the GNU General Public License
019     * along with this program, in a file called gpl.txt or license.txt.
020     * If not, write to the Free Software Foundation Inc.,
021     * 59 Temple Place - Suite 330, Boston, MA  02111-1307 USA
022     */
023    
024    package org.mycore.services.plugins;
025    
026    import java.io.BufferedOutputStream;
027    import java.io.ByteArrayInputStream;
028    import java.io.ByteArrayOutputStream;
029    import java.io.IOException;
030    import java.io.InputStream;
031    import java.io.Reader;
032    import java.io.StringReader;
033    import java.util.HashSet;
034    import java.util.zip.ZipEntry;
035    import java.util.zip.ZipInputStream;
036    
037    import org.mycore.common.MCRUtils;
038    import org.mycore.datamodel.ifs.MCRFileContentType;
039    import org.mycore.datamodel.ifs.MCRFileContentTypeFactory;
040    import org.xml.sax.Attributes;
041    import org.xml.sax.EntityResolver;
042    import org.xml.sax.InputSource;
043    import org.xml.sax.SAXException;
044    import org.xml.sax.XMLReader;
045    import org.xml.sax.helpers.DefaultHandler;
046    import org.xml.sax.helpers.XMLReaderFactory;
047    
048    /**
049     * @author Thomas Scheffler (yagee)
050     * 
051     * Need to insert some things here
052     * 
053     */
054    abstract class OpenOfficeBasePlugin implements TextFilterPlugin {
055        private static final EntityResolver OooResolver = new ResolveOfficeDTD();
056    
057        private static final String SAXparser = "org.apache.xerces.parsers.SAXParser";
058    
059        private HashSet<MCRFileContentType> contentTypes;
060    
061        private static int DEF_BYTE_SZ = 1024 * 63;
062    
063        /**
064         * 
065         */
066        OpenOfficeBasePlugin(String contentType) {
067            super();
068    
069            contentTypes = new HashSet<MCRFileContentType>();
070    
071            if (MCRFileContentTypeFactory.isTypeAvailable(contentType)) {
072                contentTypes.add(MCRFileContentTypeFactory.getType(contentType));
073            }
074    
075            try {
076                Class.forName(SAXparser);
077            } catch (ClassNotFoundException e) {
078                throw new FilterPluginInstantiationException(new StringBuilder("This Plugin is only tested with Xerces").append(
079                        "(http://xml.apache.org/xerces2-j/index.html) and").append(" though requires it to be installed somewhere in").append(
080                        " CLASSPATH. Please ensure that a jar file ").append(" containing the class ").append(SAXparser).append(
081                        " is listed in a CLASSPATH before running your").append(" brandnew MyCoRe(tm)-Application.\n").append(
082                        " I as a developer of cause know that Xerces is").append(" bundled with every MyCoRe(tm) release and thus").append(
083                        " you will never read this message.\n").append(" But just in case, I thought it is a good idea to").append(" implement this message here.")
084                        .toString());
085            }
086        }
087    
088        /*
089         * (non-Javadoc)
090         * 
091         * @see org.mycore.services.plugins.TextFilterPlugin#getName()
092         */
093        abstract public String getName();
094    
095        /*
096         * (non-Javadoc)
097         * 
098         * @see org.mycore.services.plugins.TextFilterPlugin#getInfo()
099         */
100        abstract public String getInfo();
101    
102        /*
103         * (non-Javadoc)
104         * 
105         * @see org.mycore.services.plugins.TextFilterPlugin#getSupportedContentTypes()
106         */
107        public HashSet getSupportedContentTypes() {
108            return contentTypes;
109        }
110    
111        /*
112         * (non-Javadoc)
113         * 
114         * @see org.mycore.services.plugins.TextFilterPlugin#transform(org.mycore.datamodel.ifs.MCRFileContentType,org.mycore.datamodel.ifs.MCRContentInputStream,
115         *      java.io.OutputStream)
116         */
117        public Reader transform(MCRFileContentType ct, InputStream input) throws FilterPluginTransformException {
118            if (getSupportedContentTypes().contains(ct)) {
119                try {
120                    System.out.println("Reading "+getDocumentName());
121    
122                    return getTextReader(getXMLStream(input));
123                } catch (SAXException e) {
124                    throw new FilterPluginTransformException("Error while parsing "+getDocumentName(), e);
125                } catch (IOException e) {
126                    throw new FilterPluginTransformException("Error while parsing "+getDocumentName(), e);
127                }
128            }
129            throw new FilterPluginTransformException("ContentType " + ct + " is not supported by " + getName() + "!");
130        }
131    
132        /**
133         * @see org.mycore.services.plugins.TextFilterPlugin#getMajorNumber()
134         */
135        abstract public int getMajorNumber();
136    
137        /**
138         * @see org.mycore.services.plugins.TextFilterPlugin#getMinorNumber()
139         */
140        abstract public int getMinorNumber();
141    
142        static InputStream getXMLStream(InputStream inp) throws IOException {
143            ZipInputStream zip = new ZipInputStream(inp);
144            ZipEntry ze;
145    
146            // search for "content.xml" in ZipStream
147            while ((ze = zip.getNextEntry()) != null) {
148                if (ze.getName().equals("content.xml")) {
149                    break;
150                }
151            }
152    
153            if ((ze == null) || !ze.getName().equals("content.xml")) {
154                throw new FilterPluginTransformException("No content.xml was found in OpenOffice.org document!");
155            }
156    
157            int chunkSize = (ze.getSize() < 0) ? DEF_BYTE_SZ : (int) ze.getSize();
158            ByteArrayOutputStream bos = new ByteArrayOutputStream(chunkSize);
159            BufferedOutputStream out = new BufferedOutputStream(bos);
160            byte[] ba = new byte[chunkSize];
161    
162            while (true) {
163                int bytesRead = MCRUtils.readBlocking(zip, ba, 0, chunkSize);
164    
165                if (bytesRead > 0) {
166                    out.write(ba, 0 /* offset in ba */, bytesRead /*
167                                                                     * bytes to
168                                                                     * write
169                                                                     */);
170                } else {
171                    break; // hit eof
172                }
173            }
174    
175            out.close();
176    
177            return new ByteArrayInputStream(bos.toByteArray());
178        }
179    
180        private Reader getTextReader(InputStream xml) throws IOException, SAXException {
181            XMLReader reader = XMLReaderFactory.createXMLReader(SAXparser);
182            StringBuilder buf = new StringBuilder();
183            reader.setContentHandler(new TextHandler(buf,getTextNameSpace()));
184    
185            InputSource inp = new InputSource(xml);
186            reader.setEntityResolver(OooResolver);
187            reader.parse(inp);
188    
189            return new StringBuilderReader(buf);
190        }
191        
192        abstract String getTextNameSpace();
193        abstract String getDocumentName();
194    
195        private static class StringBuilderReader extends Reader {
196            private final StringBuilder buf;
197    
198            private int pos;
199    
200            public StringBuilderReader(StringBuilder buf) {
201                this.buf = buf;
202                pos = 0;
203            }
204    
205            /*
206             * (non-Javadoc)
207             * 
208             * @see java.io.Reader#close()
209             */
210            public void close() {
211            }
212    
213            /*
214             * (non-Javadoc)
215             * 
216             * @see java.io.Reader#read(char[], int, int)
217             */
218            public int read(char[] cbuf, int off, int len) {
219                if (pos == buf.length()) {
220                    return -1;
221                }
222                int start = pos + off;
223                int charsRead = (buf.length() < (start + len)) ? (buf.length() - start) : len;
224                int end = start + charsRead;
225                buf.getChars(start, end, cbuf, 0);
226                pos = end;
227    
228                return charsRead;
229            }
230        }
231    
232        private static class TextHandler extends DefaultHandler {
233            private String textNS;
234    
235            private final StringBuilder buf;
236    
237            private boolean textElement = false;
238            
239            TextHandler(StringBuilder buf,String textNameSpace) {
240                this.buf = buf;
241                this.textNS= textNameSpace;
242            }
243    
244            /*
245             * (non-Javadoc)
246             * 
247             * @see org.xml.sax.ContentHandler#characters(char[], int, int)
248             */
249            public void characters(char[] ch, int start, int length) {
250                if (textElement) {
251                    // write text to the stream
252                    buf.append(ch, start, length).append(' ');
253                }
254            }
255    
256            /*
257             * (non-Javadoc)
258             * 
259             * @see org.xml.sax.ContentHandler#startElement(java.lang.String,
260             *      java.lang.String, java.lang.String, org.xml.sax.Attributes)
261             */
262            public void startElement(String uri, String localName, String qName, Attributes attributes) {
263                // using internal optimized Strings of Xerces-J
264                if (uri == textNS) {
265                    textElement = true;
266                } else if (uri.equals(textNS)) {
267                    textElement = true;
268    
269                    // therefor we might need to assign a given uri to textNS
270                    textNS = uri;
271                } else {
272                    textElement = false;
273                }
274            }
275        }
276    
277        private static class ResolveOfficeDTD implements EntityResolver {
278            /**
279             * returns an empty dtd to parse Ooo documents we don't need them, since
280             * we validate them either
281             */
282            public InputSource resolveEntity(String publicId, String systemId) {
283                return new InputSource(new StringReader(" "));
284            }
285        }
286    }