001 /*
002 *
003 * $Revision: 13085 $ $Date: 2008-02-06 18:27:24 +0100 (Mi, 06 Feb 2008) $
004 *
005 * This file is part of *** M y C o R e ***
006 * See http://www.mycore.de/ for details.
007 *
008 * This program is free software; you can use it, redistribute it
009 * and / or modify it under the terms of the GNU General Public License
010 * (GPL) as published by the Free Software Foundation; either version 2
011 * of the License or (at your option) any later version.
012 *
013 * This program is distributed in the hope that it will be useful, but
014 * WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
016 * GNU General Public License for more details.
017 *
018 * You should have received a copy of the GNU General Public License
019 * along with this program, in a file called gpl.txt or license.txt.
020 * If not, write to the Free Software Foundation Inc.,
021 * 59 Temple Place - Suite 330, Boston, MA 02111-1307 USA
022 */
023
024 package org.mycore.services.plugins;
025
026 import java.io.BufferedOutputStream;
027 import java.io.BufferedReader;
028 import java.io.File;
029 import java.io.FileInputStream;
030 import java.io.FileNotFoundException;
031 import java.io.FileOutputStream;
032 import java.io.IOException;
033 import java.io.InputStream;
034 import java.io.InputStreamReader;
035 import java.io.Reader;
036 import java.util.HashSet;
037
038 import org.mycore.common.MCRConfigurationException;
039 import org.mycore.common.MCRUtils;
040 import org.mycore.datamodel.ifs.MCRFileContentType;
041 import org.mycore.datamodel.ifs.MCRFileContentTypeFactory;
042
043 /**
044 * Provide some info about your class!
045 *
046 * @author Thomas Scheffler (yagee)
047 */
048 public class PdfPlugin implements TextFilterPlugin {
049 private static HashSet contentTypes = null;
050
051 private static String name = "Yagee's amazing PDF Filter";
052
053 private static final int MAJOR = 0;
054
055 private static final int MINOR = 7;
056
057 private static String info = null;
058
059 private static String p2t_info = null;
060
061 private static final String textencoding = "UTF-8";
062
063 /**
064 *
065 */
066 public PdfPlugin() {
067 super();
068
069 if (contentTypes == null) {
070 contentTypes = new HashSet();
071
072 if (MCRFileContentTypeFactory.isTypeAvailable("pdf")) {
073 contentTypes.add(MCRFileContentTypeFactory.getType("pdf"));
074 }
075 }
076
077 if ((p2t_info == null) && !pdftotext()) {
078 throw new FilterPluginInstantiationException(new StringBuffer("The execution of \"pdftotext\" failed.").append("Maybe it's not installed or in your search path!\n").append("To use this Plugin you have to install XPdf").append("http://www.foolabs.com/xpdf/) and ensure ").append("the pdftotext binary is in your search path.\n").append(
079 "Another reason maybe that you are using a version that").append(" is not compatible with this Plugin:\n").append(getName()).append(" v").append(MAJOR).append('.').append(MINOR).toString());
080 }
081
082 if (info == null) {
083 info = new StringBuffer("This filter uses XPDF for transformation.").append("\nSource code is available on http://www.foolabs.com/xpdf/").append("\nCurrently using: ").append(p2t_info).toString();
084 }
085 }
086
087 /*
088 * (non-Javadoc)
089 *
090 * @see org.mycore.services.plugins.TextFilterPlugin#getName()
091 */
092 public String getName() {
093 return name;
094 }
095
096 /*
097 * (non-Javadoc)
098 *
099 * @see org.mycore.services.plugins.TextFilterPlugin#getInfo()
100 */
101 public String getInfo() {
102 return info;
103 }
104
105 private boolean pdftotext() {
106 int rc;
107 final String[] testcommand = { "pdftotext", "-v" };
108 String s;
109 StringBuffer infofetch = new StringBuffer();
110
111 try {
112 Process p = Runtime.getRuntime().exec(testcommand);
113 BufferedReader stdError = new BufferedReader(new InputStreamReader(p.getErrorStream()));
114
115 while ((s = stdError.readLine()) != null) {
116 infofetch.append(s).append(", ");
117 }
118
119 rc = p.waitFor();
120 p2t_info = infofetch.deleteCharAt(infofetch.length() - 2).toString();
121 } catch (IOException e) {
122 if (e.getMessage().indexOf("not found") > 0) {
123 //NOTE: It is a ugly pain to parse a error message, but at worst we throw the wrong error message
124 throw new FilterPluginInstantiationException(new StringBuffer(testcommand[0]).append(" is not installed or in search path!\n").append("To use this Plugin you have to install XPdf").append("http://www.foolabs.com/xpdf/) and ensure ").append("the pdftotext binary is in your search path.").toString(), e);
125 }
126 throw new FilterPluginInstantiationException("Error while excuting " + testcommand, e);
127 } catch (InterruptedException e) {
128 throw new FilterPluginInstantiationException("Error while excuting " + testcommand, e);
129 }
130
131 return (rc == 99);
132 }
133
134 private boolean pdftotext(File pdffile, File txtfile) {
135 int rc;
136 final String[] testcommand = { "pdftotext", "-enc", textencoding, "-raw", pdffile.getAbsolutePath(), txtfile.getAbsolutePath() };
137 String s;
138
139 try {
140 StringBuffer sb = new StringBuffer();
141
142 for (int i = 0; i < testcommand.length; i++) {
143 sb.append(testcommand[i]).append(' ');
144 }
145
146 System.err.println(sb);
147
148 Process p = Runtime.getRuntime().exec(testcommand);
149 BufferedReader stdError = new BufferedReader(new InputStreamReader(p.getErrorStream()));
150
151 while ((s = stdError.readLine()) != null) {
152 System.err.println(s);
153 }
154
155 rc = p.waitFor();
156 } catch (IOException e) {
157 if (e.getMessage().indexOf("not found") > 0) {
158 //NOTE: It is a ugly pain to parse a error message, but at worst we throw the wrong error message
159 throw new MCRConfigurationException(testcommand[0] + " is not installed or in search path!", e);
160 }
161 throw new MCRConfigurationException("Error while excuting " + testcommand, e);
162 } catch (InterruptedException e) {
163 throw new MCRConfigurationException("Error while excuting " + testcommand, e);
164 }
165
166 return (rc == 00);
167 }
168
169 /*
170 * (non-Javadoc)
171 *
172 * @see org.mycore.services.plugins.TextFilterPlugin#getSupportedContentTypes()
173 */
174 public HashSet getSupportedContentTypes() {
175 return contentTypes;
176 }
177
178 public Reader transform(MCRFileContentType ct, InputStream input) throws FilterPluginTransformException {
179 if (!getSupportedContentTypes().contains(ct)) {
180 throw new FilterPluginTransformException("ContentType " + ct + " is not supported by " + getName() + "!");
181 }
182
183 try {
184 System.err.println("===== PDF decoding starts ====");
185
186 File pdffile = File.createTempFile("inp", ".pdf");
187 BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(pdffile));
188 pdffile.deleteOnExit();
189 MCRUtils.copyStream(input, out);
190 out.close();
191
192 File txtfile = File.createTempFile("out", ".txt");
193 txtfile.deleteOnExit();
194
195 if (!pdftotext(pdffile, txtfile)) {
196 throw new FilterPluginTransformException("pdftotext reported an error while exporting text of PDF file!");
197 }
198
199 pdffile.delete();
200
201 FileInputStream fin = new FileInputStream(txtfile);
202
203 return new InputStreamReader(fin, textencoding);
204 } catch (FileNotFoundException e) {
205 throw new FilterPluginTransformException("File was not found!", e);
206 } catch (IOException e) {
207 throw new FilterPluginTransformException("General I/O Exception occured", e);
208 }
209 }
210
211 /**
212 * @see org.mycore.services.plugins.TextFilterPlugin#getMajorNumber()
213 */
214 public int getMajorNumber() {
215 return MAJOR;
216 }
217
218 /**
219 * @see org.mycore.services.plugins.TextFilterPlugin#getMinorNumber()
220 */
221 public int getMinorNumber() {
222 return MINOR;
223 }
224 }