001 /*
002 *
003 * $Revision: 13085 $ $Date: 2008-02-06 18:27:24 +0100 (Mi, 06 Feb 2008) $
004 *
005 * This file is part of *** M y C o R e ***
006 * See http://www.mycore.de/ for details.
007 *
008 * This program is free software; you can use it, redistribute it
009 * and / or modify it under the terms of the GNU General Public License
010 * (GPL) as published by the Free Software Foundation; either version 2
011 * of the License or (at your option) any later version.
012 *
013 * This program is distributed in the hope that it will be useful, but
014 * WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
016 * GNU General Public License for more details.
017 *
018 * You should have received a copy of the GNU General Public License
019 * along with this program, in a file called gpl.txt or license.txt.
020 * If not, write to the Free Software Foundation Inc.,
021 * 59 Temple Place - Suite 330, Boston, MA 02111-1307 USA
022 */
023
024 package org.mycore.datamodel.ifs.extractors;
025
026 import java.io.InputStream;
027 import java.util.Calendar;
028
029 import org.jdom.Element;
030 import org.mycore.datamodel.metadata.MCRMetaISO8601Date;
031 import org.pdfbox.pdmodel.PDDocument;
032 import org.pdfbox.pdmodel.PDDocumentInformation;
033 import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
034 import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
035
036 /**
037 * Extracts metadata from PDF files using the PDFBox library. The number of
038 * pages, document information like author and title, and the titles of all
039 * outline items (table of contents) are extracted. See http://www.pdfbox.org/
040 * for details.
041 *
042 * @author Frank Lützenkirchen
043 * @version $Revision: 13085 $ $Date: 2008-02-06 18:27:24 +0100 (Mi, 06 Feb 2008) $
044 */
045 public class MCRDataExtractorPDF extends MCRDataExtractor {
046
047 protected String getSupportedContentTypeIDs() {
048 return "jpeg";
049 }
050
051 protected void extractData(Element container, InputStream in) throws Exception {
052 PDDocument pdf = PDDocument.load(in);
053
054 // Number of pages
055 addDataValue(container, "numPages", String.valueOf(pdf.getNumberOfPages()));
056
057 // Document information
058 PDDocumentInformation info = pdf.getDocumentInformation();
059 MCRMetaISO8601Date iso = new MCRMetaISO8601Date();
060 Calendar cal = info.getCreationDate();
061 if (cal != null) {
062 iso.setDate(cal.getTime());
063 addDataValue(container, "created", iso.getISOString());
064 }
065 cal = info.getModificationDate();
066 if (cal != null) {
067 iso.setDate(cal.getTime());
068 addDataValue(container, "modified", iso.getISOString());
069 }
070
071 addDataValue(container, "author", info.getAuthor());
072 addDataValue(container, "creator", info.getCreator());
073 addDataValue(container, "keywords", info.getKeywords());
074 addDataValue(container, "producer", info.getProducer());
075 addDataValue(container, "subject", info.getSubject());
076 addDataValue(container, "title", info.getTitle());
077
078 // Document outline
079 PDDocumentOutline root = pdf.getDocumentCatalog().getDocumentOutline();
080 Element outline = new Element("outline");
081 addOutlineItems(outline, root.getFirstChild());
082 if (outline.getChildren().size() > 0)
083 container.addContent(outline);
084
085 pdf.close();
086 }
087
088 /**
089 * Extracts the titles of outline items
090 */
091 private void addOutlineItems(Element parent, PDOutlineItem item) {
092 while (item != null) {
093 Element xItem = new Element("item");
094 xItem.setAttribute("title", item.getTitle());
095 parent.addContent(xItem);
096 addOutlineItems(xItem, item.getFirstChild());
097 item = item.getNextSibling();
098 }
099 }
100
101 /**
102 * Test application that outputs extracted metadata for a given local file.
103 *
104 * @param args
105 * the path to a locally stored PDF file
106 */
107 public static void main(String[] args) {
108 new MCRDataExtractorPDF().testLocalFile(args[0]);
109 }
110 }