001 /*
002 *
003 * $Revision: 13085 $ $Date: 2008-02-06 18:27:24 +0100 (Mi, 06 Feb 2008) $
004 *
005 * This file is part of *** M y C o R e ***
006 * See http://www.mycore.de/ for details.
007 *
008 * This program is free software; you can use it, redistribute it
009 * and / or modify it under the terms of the GNU General Public License
010 * (GPL) as published by the Free Software Foundation; either version 2
011 * of the License or (at your option) any later version.
012 *
013 * This program is distributed in the hope that it will be useful, but
014 * WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
016 * GNU General Public License for more details.
017 *
018 * You should have received a copy of the GNU General Public License
019 * along with this program, in a file called gpl.txt or license.txt.
020 * If not, write to the Free Software Foundation Inc.,
021 * 59 Temple Place - Suite 330, Boston, MA 02111-1307 USA
022 */
023
024 package org.mycore.services.plugins;
025
026 import java.io.BufferedOutputStream;
027 import java.io.ByteArrayInputStream;
028 import java.io.ByteArrayOutputStream;
029 import java.io.IOException;
030 import java.io.InputStream;
031 import java.io.Reader;
032 import java.io.StringReader;
033 import java.util.HashSet;
034 import java.util.zip.ZipEntry;
035 import java.util.zip.ZipInputStream;
036
037 import org.mycore.common.MCRUtils;
038 import org.mycore.datamodel.ifs.MCRFileContentType;
039 import org.mycore.datamodel.ifs.MCRFileContentTypeFactory;
040 import org.xml.sax.Attributes;
041 import org.xml.sax.EntityResolver;
042 import org.xml.sax.InputSource;
043 import org.xml.sax.SAXException;
044 import org.xml.sax.XMLReader;
045 import org.xml.sax.helpers.DefaultHandler;
046 import org.xml.sax.helpers.XMLReaderFactory;
047
048 /**
049 * @author Thomas Scheffler (yagee)
050 *
051 * Need to insert some things here
052 *
053 */
054 abstract class OpenOfficeBasePlugin implements TextFilterPlugin {
055 private static final EntityResolver OooResolver = new ResolveOfficeDTD();
056
057 private static final String SAXparser = "org.apache.xerces.parsers.SAXParser";
058
059 private HashSet<MCRFileContentType> contentTypes;
060
061 private static int DEF_BYTE_SZ = 1024 * 63;
062
063 /**
064 *
065 */
066 OpenOfficeBasePlugin(String contentType) {
067 super();
068
069 contentTypes = new HashSet<MCRFileContentType>();
070
071 if (MCRFileContentTypeFactory.isTypeAvailable(contentType)) {
072 contentTypes.add(MCRFileContentTypeFactory.getType(contentType));
073 }
074
075 try {
076 Class.forName(SAXparser);
077 } catch (ClassNotFoundException e) {
078 throw new FilterPluginInstantiationException(new StringBuilder("This Plugin is only tested with Xerces").append(
079 "(http://xml.apache.org/xerces2-j/index.html) and").append(" though requires it to be installed somewhere in").append(
080 " CLASSPATH. Please ensure that a jar file ").append(" containing the class ").append(SAXparser).append(
081 " is listed in a CLASSPATH before running your").append(" brandnew MyCoRe(tm)-Application.\n").append(
082 " I as a developer of cause know that Xerces is").append(" bundled with every MyCoRe(tm) release and thus").append(
083 " you will never read this message.\n").append(" But just in case, I thought it is a good idea to").append(" implement this message here.")
084 .toString());
085 }
086 }
087
088 /*
089 * (non-Javadoc)
090 *
091 * @see org.mycore.services.plugins.TextFilterPlugin#getName()
092 */
093 abstract public String getName();
094
095 /*
096 * (non-Javadoc)
097 *
098 * @see org.mycore.services.plugins.TextFilterPlugin#getInfo()
099 */
100 abstract public String getInfo();
101
102 /*
103 * (non-Javadoc)
104 *
105 * @see org.mycore.services.plugins.TextFilterPlugin#getSupportedContentTypes()
106 */
107 public HashSet getSupportedContentTypes() {
108 return contentTypes;
109 }
110
111 /*
112 * (non-Javadoc)
113 *
114 * @see org.mycore.services.plugins.TextFilterPlugin#transform(org.mycore.datamodel.ifs.MCRFileContentType,org.mycore.datamodel.ifs.MCRContentInputStream,
115 * java.io.OutputStream)
116 */
117 public Reader transform(MCRFileContentType ct, InputStream input) throws FilterPluginTransformException {
118 if (getSupportedContentTypes().contains(ct)) {
119 try {
120 System.out.println("Reading "+getDocumentName());
121
122 return getTextReader(getXMLStream(input));
123 } catch (SAXException e) {
124 throw new FilterPluginTransformException("Error while parsing "+getDocumentName(), e);
125 } catch (IOException e) {
126 throw new FilterPluginTransformException("Error while parsing "+getDocumentName(), e);
127 }
128 }
129 throw new FilterPluginTransformException("ContentType " + ct + " is not supported by " + getName() + "!");
130 }
131
132 /**
133 * @see org.mycore.services.plugins.TextFilterPlugin#getMajorNumber()
134 */
135 abstract public int getMajorNumber();
136
137 /**
138 * @see org.mycore.services.plugins.TextFilterPlugin#getMinorNumber()
139 */
140 abstract public int getMinorNumber();
141
142 static InputStream getXMLStream(InputStream inp) throws IOException {
143 ZipInputStream zip = new ZipInputStream(inp);
144 ZipEntry ze;
145
146 // search for "content.xml" in ZipStream
147 while ((ze = zip.getNextEntry()) != null) {
148 if (ze.getName().equals("content.xml")) {
149 break;
150 }
151 }
152
153 if ((ze == null) || !ze.getName().equals("content.xml")) {
154 throw new FilterPluginTransformException("No content.xml was found in OpenOffice.org document!");
155 }
156
157 int chunkSize = (ze.getSize() < 0) ? DEF_BYTE_SZ : (int) ze.getSize();
158 ByteArrayOutputStream bos = new ByteArrayOutputStream(chunkSize);
159 BufferedOutputStream out = new BufferedOutputStream(bos);
160 byte[] ba = new byte[chunkSize];
161
162 while (true) {
163 int bytesRead = MCRUtils.readBlocking(zip, ba, 0, chunkSize);
164
165 if (bytesRead > 0) {
166 out.write(ba, 0 /* offset in ba */, bytesRead /*
167 * bytes to
168 * write
169 */);
170 } else {
171 break; // hit eof
172 }
173 }
174
175 out.close();
176
177 return new ByteArrayInputStream(bos.toByteArray());
178 }
179
180 private Reader getTextReader(InputStream xml) throws IOException, SAXException {
181 XMLReader reader = XMLReaderFactory.createXMLReader(SAXparser);
182 StringBuilder buf = new StringBuilder();
183 reader.setContentHandler(new TextHandler(buf,getTextNameSpace()));
184
185 InputSource inp = new InputSource(xml);
186 reader.setEntityResolver(OooResolver);
187 reader.parse(inp);
188
189 return new StringBuilderReader(buf);
190 }
191
192 abstract String getTextNameSpace();
193 abstract String getDocumentName();
194
195 private static class StringBuilderReader extends Reader {
196 private final StringBuilder buf;
197
198 private int pos;
199
200 public StringBuilderReader(StringBuilder buf) {
201 this.buf = buf;
202 pos = 0;
203 }
204
205 /*
206 * (non-Javadoc)
207 *
208 * @see java.io.Reader#close()
209 */
210 public void close() {
211 }
212
213 /*
214 * (non-Javadoc)
215 *
216 * @see java.io.Reader#read(char[], int, int)
217 */
218 public int read(char[] cbuf, int off, int len) {
219 if (pos == buf.length()) {
220 return -1;
221 }
222 int start = pos + off;
223 int charsRead = (buf.length() < (start + len)) ? (buf.length() - start) : len;
224 int end = start + charsRead;
225 buf.getChars(start, end, cbuf, 0);
226 pos = end;
227
228 return charsRead;
229 }
230 }
231
232 private static class TextHandler extends DefaultHandler {
233 private String textNS;
234
235 private final StringBuilder buf;
236
237 private boolean textElement = false;
238
239 TextHandler(StringBuilder buf,String textNameSpace) {
240 this.buf = buf;
241 this.textNS= textNameSpace;
242 }
243
244 /*
245 * (non-Javadoc)
246 *
247 * @see org.xml.sax.ContentHandler#characters(char[], int, int)
248 */
249 public void characters(char[] ch, int start, int length) {
250 if (textElement) {
251 // write text to the stream
252 buf.append(ch, start, length).append(' ');
253 }
254 }
255
256 /*
257 * (non-Javadoc)
258 *
259 * @see org.xml.sax.ContentHandler#startElement(java.lang.String,
260 * java.lang.String, java.lang.String, org.xml.sax.Attributes)
261 */
262 public void startElement(String uri, String localName, String qName, Attributes attributes) {
263 // using internal optimized Strings of Xerces-J
264 if (uri == textNS) {
265 textElement = true;
266 } else if (uri.equals(textNS)) {
267 textElement = true;
268
269 // therefor we might need to assign a given uri to textNS
270 textNS = uri;
271 } else {
272 textElement = false;
273 }
274 }
275 }
276
277 private static class ResolveOfficeDTD implements EntityResolver {
278 /**
279 * returns an empty dtd to parse Ooo documents we don't need them, since
280 * we validate them either
281 */
282 public InputSource resolveEntity(String publicId, String systemId) {
283 return new InputSource(new StringReader(" "));
284 }
285 }
286 }