001    /*
002     * 
003     * $Revision: 13085 $ $Date: 2008-02-06 18:27:24 +0100 (Mi, 06 Feb 2008) $
004     *
005     * This file is part of ***  M y C o R e  ***
006     * See http://www.mycore.de/ for details.
007     *
008     * This program is free software; you can use it, redistribute it
009     * and / or modify it under the terms of the GNU General Public License
010     * (GPL) as published by the Free Software Foundation; either version 2
011     * of the License or (at your option) any later version.
012     *
013     * This program is distributed in the hope that it will be useful, but
014     * WITHOUT ANY WARRANTY; without even the implied warranty of
015     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016     * GNU General Public License for more details.
017     *
018     * You should have received a copy of the GNU General Public License
019     * along with this program, in a file called gpl.txt or license.txt.
020     * If not, write to the Free Software Foundation Inc.,
021     * 59 Temple Place - Suite 330, Boston, MA  02111-1307 USA
022     */
023    
024    package org.mycore.services.plugins;
025    
026    import java.io.BufferedOutputStream;
027    import java.io.BufferedReader;
028    import java.io.File;
029    import java.io.FileInputStream;
030    import java.io.FileNotFoundException;
031    import java.io.FileOutputStream;
032    import java.io.IOException;
033    import java.io.InputStream;
034    import java.io.InputStreamReader;
035    import java.io.Reader;
036    import java.util.HashSet;
037    
038    import org.mycore.common.MCRConfigurationException;
039    import org.mycore.common.MCRUtils;
040    import org.mycore.datamodel.ifs.MCRFileContentType;
041    import org.mycore.datamodel.ifs.MCRFileContentTypeFactory;
042    
043    /**
044     * Provide some info about your class!
045     * 
046     * @author Thomas Scheffler (yagee)
047     */
048    public class PdfPlugin implements TextFilterPlugin {
049        private static HashSet contentTypes = null;
050    
051        private static String name = "Yagee's amazing PDF Filter";
052    
053        private static final int MAJOR = 0;
054    
055        private static final int MINOR = 7;
056    
057        private static String info = null;
058    
059        private static String p2t_info = null;
060    
061        private static final String textencoding = "UTF-8";
062    
063        /**
064         * 
065         */
066        public PdfPlugin() {
067            super();
068    
069            if (contentTypes == null) {
070                contentTypes = new HashSet();
071    
072                if (MCRFileContentTypeFactory.isTypeAvailable("pdf")) {
073                    contentTypes.add(MCRFileContentTypeFactory.getType("pdf"));
074                }
075            }
076    
077            if ((p2t_info == null) && !pdftotext()) {
078                throw new FilterPluginInstantiationException(new StringBuffer("The execution of \"pdftotext\" failed.").append("Maybe it's not installed or in your search path!\n").append("To use this Plugin you have to install XPdf").append("http://www.foolabs.com/xpdf/) and ensure ").append("the pdftotext binary is in your search path.\n").append(
079                        "Another reason maybe that you are using a version that").append(" is not compatible with this Plugin:\n").append(getName()).append(" v").append(MAJOR).append('.').append(MINOR).toString());
080            }
081    
082            if (info == null) {
083                info = new StringBuffer("This filter uses XPDF for transformation.").append("\nSource code is available on http://www.foolabs.com/xpdf/").append("\nCurrently using: ").append(p2t_info).toString();
084            }
085        }
086    
087        /*
088         * (non-Javadoc)
089         * 
090         * @see org.mycore.services.plugins.TextFilterPlugin#getName()
091         */
092        public String getName() {
093            return name;
094        }
095    
096        /*
097         * (non-Javadoc)
098         * 
099         * @see org.mycore.services.plugins.TextFilterPlugin#getInfo()
100         */
101        public String getInfo() {
102            return info;
103        }
104    
105        private boolean pdftotext() {
106            int rc;
107            final String[] testcommand = { "pdftotext", "-v" };
108            String s;
109            StringBuffer infofetch = new StringBuffer();
110    
111            try {
112                Process p = Runtime.getRuntime().exec(testcommand);
113                BufferedReader stdError = new BufferedReader(new InputStreamReader(p.getErrorStream()));
114    
115                while ((s = stdError.readLine()) != null) {
116                    infofetch.append(s).append(", ");
117                }
118    
119                rc = p.waitFor();
120                p2t_info = infofetch.deleteCharAt(infofetch.length() - 2).toString();
121            } catch (IOException e) {
122                if (e.getMessage().indexOf("not found") > 0) {
123                    //NOTE: It is a ugly pain to parse a error message, but at worst we throw the wrong error message
124                    throw new FilterPluginInstantiationException(new StringBuffer(testcommand[0]).append(" is not installed or in search path!\n").append("To use this Plugin you have to install XPdf").append("http://www.foolabs.com/xpdf/) and ensure ").append("the pdftotext binary is in your search path.").toString(), e);
125                }
126                throw new FilterPluginInstantiationException("Error while excuting " + testcommand, e);
127            } catch (InterruptedException e) {
128                throw new FilterPluginInstantiationException("Error while excuting " + testcommand, e);
129            }
130    
131            return (rc == 99);
132        }
133    
134        private boolean pdftotext(File pdffile, File txtfile) {
135            int rc;
136            final String[] testcommand = { "pdftotext", "-enc", textencoding, "-raw", pdffile.getAbsolutePath(), txtfile.getAbsolutePath() };
137            String s;
138    
139            try {
140                StringBuffer sb = new StringBuffer();
141    
142                for (int i = 0; i < testcommand.length; i++) {
143                    sb.append(testcommand[i]).append(' ');
144                }
145    
146                System.err.println(sb);
147    
148                Process p = Runtime.getRuntime().exec(testcommand);
149                BufferedReader stdError = new BufferedReader(new InputStreamReader(p.getErrorStream()));
150    
151                while ((s = stdError.readLine()) != null) {
152                    System.err.println(s);
153                }
154    
155                rc = p.waitFor();
156            } catch (IOException e) {
157                if (e.getMessage().indexOf("not found") > 0) {
158                    //NOTE: It is a ugly pain to parse a error message, but at worst we throw the wrong error message
159                    throw new MCRConfigurationException(testcommand[0] + " is not installed or in search path!", e);
160                }
161                throw new MCRConfigurationException("Error while excuting " + testcommand, e);
162            } catch (InterruptedException e) {
163                throw new MCRConfigurationException("Error while excuting " + testcommand, e);
164            }
165    
166            return (rc == 00);
167        }
168    
169        /*
170         * (non-Javadoc)
171         * 
172         * @see org.mycore.services.plugins.TextFilterPlugin#getSupportedContentTypes()
173         */
174        public HashSet getSupportedContentTypes() {
175            return contentTypes;
176        }
177    
178        public Reader transform(MCRFileContentType ct, InputStream input) throws FilterPluginTransformException {
179            if (!getSupportedContentTypes().contains(ct)) {
180                throw new FilterPluginTransformException("ContentType " + ct + " is not supported by " + getName() + "!");
181            }
182    
183            try {
184                System.err.println("===== PDF decoding starts ====");
185    
186                File pdffile = File.createTempFile("inp", ".pdf");
187                BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(pdffile));
188                pdffile.deleteOnExit();
189                MCRUtils.copyStream(input, out);
190                out.close();
191    
192                File txtfile = File.createTempFile("out", ".txt");
193                txtfile.deleteOnExit();
194    
195                if (!pdftotext(pdffile, txtfile)) {
196                    throw new FilterPluginTransformException("pdftotext reported an error while exporting text of PDF file!");
197                }
198    
199                pdffile.delete();
200    
201                FileInputStream fin = new FileInputStream(txtfile);
202    
203                return new InputStreamReader(fin, textencoding);
204            } catch (FileNotFoundException e) {
205                throw new FilterPluginTransformException("File was not found!", e);
206            } catch (IOException e) {
207                throw new FilterPluginTransformException("General I/O Exception occured", e);
208            }
209        }
210    
211        /**
212         * @see org.mycore.services.plugins.TextFilterPlugin#getMajorNumber()
213         */
214        public int getMajorNumber() {
215            return MAJOR;
216        }
217    
218        /**
219         * @see org.mycore.services.plugins.TextFilterPlugin#getMinorNumber()
220         */
221        public int getMinorNumber() {
222            return MINOR;
223        }
224    }