001    /*
002     * 
003     * $Revision: 13085 $ $Date: 2008-02-06 18:27:24 +0100 (Mi, 06 Feb 2008) $
004     *
005     * This file is part of ***  M y C o R e  ***
006     * See http://www.mycore.de/ for details.
007     *
008     * This program is free software; you can use it, redistribute it
009     * and / or modify it under the terms of the GNU General Public License
010     * (GPL) as published by the Free Software Foundation; either version 2
011     * of the License or (at your option) any later version.
012     *
013     * This program is distributed in the hope that it will be useful, but
014     * WITHOUT ANY WARRANTY; without even the implied warranty of
015     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016     * GNU General Public License for more details.
017     *
018     * You should have received a copy of the GNU General Public License
019     * along with this program, in a file called gpl.txt or license.txt.
020     * If not, write to the Free Software Foundation Inc.,
021     * 59 Temple Place - Suite 330, Boston, MA  02111-1307 USA
022     */
023    
024    package org.mycore.datamodel.ifs.extractors;
025    
026    import java.io.InputStream;
027    import java.util.Calendar;
028    
029    import org.jdom.Element;
030    import org.mycore.datamodel.metadata.MCRMetaISO8601Date;
031    import org.pdfbox.pdmodel.PDDocument;
032    import org.pdfbox.pdmodel.PDDocumentInformation;
033    import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
034    import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
035    
036    /**
037     * Extracts metadata from PDF files using the PDFBox library. The number of
038     * pages, document information like author and title, and the titles of all
039     * outline items (table of contents) are extracted. See http://www.pdfbox.org/
040     * for details.
041     * 
042     * @author Frank Lützenkirchen
043     * @version $Revision: 13085 $ $Date: 2008-02-06 18:27:24 +0100 (Mi, 06 Feb 2008) $
044     */
045    public class MCRDataExtractorPDF extends MCRDataExtractor {
046    
047        protected String getSupportedContentTypeIDs() {
048            return "jpeg";
049        }
050    
051        protected void extractData(Element container, InputStream in) throws Exception {
052            PDDocument pdf = PDDocument.load(in);
053    
054            // Number of pages
055            addDataValue(container, "numPages", String.valueOf(pdf.getNumberOfPages()));
056    
057            // Document information
058            PDDocumentInformation info = pdf.getDocumentInformation();
059            MCRMetaISO8601Date iso = new MCRMetaISO8601Date();
060            Calendar cal = info.getCreationDate();
061            if (cal != null) {
062                iso.setDate(cal.getTime());
063                addDataValue(container, "created", iso.getISOString());
064            }
065            cal = info.getModificationDate();
066            if (cal != null) {
067                iso.setDate(cal.getTime());
068                addDataValue(container, "modified", iso.getISOString());
069            }
070    
071            addDataValue(container, "author", info.getAuthor());
072            addDataValue(container, "creator", info.getCreator());
073            addDataValue(container, "keywords", info.getKeywords());
074            addDataValue(container, "producer", info.getProducer());
075            addDataValue(container, "subject", info.getSubject());
076            addDataValue(container, "title", info.getTitle());
077    
078            // Document outline
079            PDDocumentOutline root = pdf.getDocumentCatalog().getDocumentOutline();
080            Element outline = new Element("outline");
081            addOutlineItems(outline, root.getFirstChild());
082            if (outline.getChildren().size() > 0)
083                container.addContent(outline);
084    
085            pdf.close();
086        }
087    
088        /**
089         * Extracts the titles of outline items
090         */
091        private void addOutlineItems(Element parent, PDOutlineItem item) {
092            while (item != null) {
093                Element xItem = new Element("item");
094                xItem.setAttribute("title", item.getTitle());
095                parent.addContent(xItem);
096                addOutlineItems(xItem, item.getFirstChild());
097                item = item.getNextSibling();
098            }
099        }
100    
101        /**
102         * Test application that outputs extracted metadata for a given local file.
103         * 
104         * @param args
105         *            the path to a locally stored PDF file
106         */
107        public static void main(String[] args) {
108            new MCRDataExtractorPDF().testLocalFile(args[0]);
109        }
110    }