View Javadoc
1   /*
2    * This file is part of ***  M y C o R e  ***
3    * See http://www.mycore.de/ for details.
4    *
5    * MyCoRe is free software: you can redistribute it and/or modify
6    * it under the terms of the GNU General Public License as published by
7    * the Free Software Foundation, either version 3 of the License, or
8    * (at your option) any later version.
9    *
10   * MyCoRe is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU General Public License for more details.
14   *
15   * You should have received a copy of the GNU General Public License
16   * along with MyCoRe.  If not, see <http://www.gnu.org/licenses/>.
17   */
18  
19  package org.mycore.mods.rss;
20  
21  import java.io.IOException;
22  import java.net.URL;
23  import java.text.MessageFormat;
24  import java.util.ArrayList;
25  import java.util.HashMap;
26  import java.util.List;
27  import java.util.Locale;
28  import java.util.regex.Matcher;
29  import java.util.regex.Pattern;
30  import java.util.regex.PatternSyntaxException;
31  
32  import org.apache.logging.log4j.LogManager;
33  import org.apache.logging.log4j.Logger;
34  import org.apache.solr.client.solrj.SolrClient;
35  import org.apache.solr.client.solrj.SolrQuery;
36  import org.apache.solr.common.SolrDocumentList;
37  import org.jdom2.Document;
38  import org.jdom2.Element;
39  import org.jdom2.filter.ElementFilter;
40  import org.mycore.access.MCRAccessException;
41  import org.mycore.common.MCRConstants;
42  import org.mycore.common.MCRException;
43  import org.mycore.common.MCRMailer;
44  import org.mycore.common.MCRPersistenceException;
45  import org.mycore.common.config.MCRConfiguration2;
46  import org.mycore.common.config.MCRConfigurationException;
47  import org.mycore.common.xml.MCRURIResolver;
48  import org.mycore.datamodel.metadata.MCRMetadataManager;
49  import org.mycore.datamodel.metadata.MCRObject;
50  import org.mycore.datamodel.metadata.MCRObjectID;
51  import org.mycore.mods.MCRMODSWrapper;
52  import org.mycore.solr.MCRSolrClientFactory;
53  import org.mycore.solr.MCRSolrUtils;
54  
55  import com.rometools.rome.feed.synd.SyndEntry;
56  import com.rometools.rome.feed.synd.SyndFeed;
57  import com.rometools.rome.io.FeedException;
58  import com.rometools.rome.io.SyndFeedInput;
59  import com.rometools.rome.io.XmlReader;
60  
61  /**
62   * Reads an RSS feed referencing new publications and imports those publications that are not stored yet.
63   *
64   * Usage:
65   *   MCRRSSFeedImporter.importFromFeed( [sourceSystemID], [targetProjectID] );
66   *   where targetProjectID is the target project ID to import mods objects to, e.g. "mir".
67   *
68   * Reads the RSS feed configured via
69   * MCR.MODS.RSSFeedImporter.[sourceSystemID].FeedURL=[http(s) URL of remote RSS feed to read]
70   *
71   * For each entry,
72   *
73   * 1. Gets the link given in that entry (assuming it points to the publications) and
74   * extracts the publication ID from the link, using a regular expression configured via
75   * MCR.MODS.RSSFeedImporter.[sourceSystemID].Pattern2FindID=
76   *
77   * 2. Queries the SOLR index to check if this publication isn't already stored. The field to query is
78   * MCR.MODS.RSSFeedImporter.[sourceSystemID].Field2QueryID=[SOLR field name]
79   *
80   * 3. Retrieves the publication metadata from the remote system and converts it to &lt;mycoreobject /&gt; XML.
81   * MCR.MODS.RSSFeedImporter.[sourceSystemID].PublicationURI=xslStyle:...:http://...{0}...
82   * where the remote publication ID will be replaced in Java message format syntax as {0}.
83   *
84   * 4. Saves the publication in persistent store, with the given projectID and object type "mods".
85   *
86   * When the total number of publications imported is &gt; 0 AND the property
87   * MCR.MODS.RSSFeedImporter.[sourceSystemID].XSL2BuildNotificationMail=foo.xsl
88   * is set, builds and sends a notification mail via MCRMailer.
89   *
90   * @author Frank L\u00FCtzenkirchen
91   */
92  public class MCRRSSFeedImporter {
93  
94      private String sourceSystemID;
95  
96      private String feedURL;
97  
98      private Pattern pattern2findID;
99  
100     private String field2queryID;
101 
102     private String importURI;
103 
104     private String xsl2BuildNotificationMail;
105 
106     private static final String STATUS_FLAG = "imported";
107 
108     private static final String PROPERTY_MAIL_ADDRESS = "MCR.Mail.Address";
109 
110     private static final Logger LOGGER = LogManager.getLogger(MCRRSSFeedImporter.class);
111 
112     public static void importFromFeed(String sourceSystemID, String projectID) throws Exception {
113         MCRRSSFeedImporter importer = new MCRRSSFeedImporter(sourceSystemID);
114         importer.importPublications(projectID);
115     }
116 
117     public MCRRSSFeedImporter(String sourceSystemID) {
118         this.sourceSystemID = sourceSystemID;
119 
120         String prefix = "MCR.MODS.RSSFeedImporter." + sourceSystemID + ".";
121 
122         feedURL = MCRConfiguration2.getStringOrThrow(prefix + "FeedURL");
123         importURI = MCRConfiguration2.getStringOrThrow(prefix + "PublicationURI");
124         field2queryID = MCRConfiguration2.getStringOrThrow(prefix + "Field2QueryID");
125         xsl2BuildNotificationMail = MCRConfiguration2.getString(prefix + "XSL2BuildNotificationMail").orElse(null);
126 
127         getPattern2FindID(prefix);
128     }
129 
130     private void getPattern2FindID(String prefix) {
131         String patternProperty = prefix + "Pattern2FindID";
132         try {
133             String pattern = MCRConfiguration2.getStringOrThrow(patternProperty);
134             pattern2findID = Pattern.compile(pattern);
135         } catch (PatternSyntaxException ex) {
136             String msg = "Regular expression syntax error: " + patternProperty;
137             throw new MCRConfigurationException(msg, ex);
138         }
139     }
140 
141     public void importPublications(String projectID) throws Exception {
142         LOGGER.info("Getting new publications from {} RSS feed...", sourceSystemID);
143         SyndFeed feed = retrieveFeed();
144 
145         List<MCRObject> importedObjects = new ArrayList<>();
146         for (SyndEntry entry : feed.getEntries()) {
147             MCRObject importedObject = handleFeedEntry(entry, projectID);
148             if (importedObject != null) {
149                 importedObjects.add(importedObject);
150             }
151         }
152 
153         int numPublicationsImported = importedObjects.size();
154         LOGGER.info("imported {} publications.", numPublicationsImported);
155 
156         if ((numPublicationsImported > 0) && (xsl2BuildNotificationMail != null)) {
157             sendNotificationMail(importedObjects);
158         }
159     }
160 
161     private SyndFeed retrieveFeed() throws IOException, FeedException {
162         XmlReader feedReader = new XmlReader(new URL(feedURL));
163         SyndFeedInput input = new SyndFeedInput();
164         return input.build(feedReader);
165     }
166 
167     private MCRObject handleFeedEntry(SyndEntry entry, String projectID)
168         throws MCRPersistenceException, MCRAccessException {
169         String publicationID = getPublicationID(entry);
170         if (publicationID == null) {
171             return null;
172         }
173 
174         if (isAlreadyStored(publicationID)) {
175             LOGGER.info("publication with ID {} already existing, will not import.", publicationID);
176             return null;
177         }
178 
179         LOGGER.info("publication with ID {} does not exist yet, retrieving data...", publicationID);
180         Element publicationXML = retrieveAndConvertPublication(publicationID);
181         if (shouldIgnore(publicationXML)) {
182             LOGGER.info("publication will be ignored, do not store.");
183             return null;
184         }
185 
186         MCRObject obj = buildMCRObject(publicationXML, projectID);
187         MCRMetadataManager.create(obj);
188         return obj;
189     }
190 
191     private String getPublicationID(SyndEntry entry) {
192         String link = entry.getLink();
193         if (link == null) {
194             LOGGER.warn("no link found in feed entry");
195             return null;
196         }
197         link = link.trim();
198         Matcher m = pattern2findID.matcher(link);
199         if (m.matches()) {
200             return m.group(1);
201         } else {
202             LOGGER.warn("no publication ID found in link {}", link);
203             return null;
204         }
205     }
206 
207     private boolean isAlreadyStored(String publicationID) {
208         SolrClient solrClient = MCRSolrClientFactory.getMainSolrClient();
209         SolrQuery query = new SolrQuery();
210         query.setQuery(field2queryID + ":" + MCRSolrUtils.escapeSearchValue(publicationID));
211         query.setRows(0);
212         SolrDocumentList results;
213         try {
214             results = solrClient.query(query).getResults();
215             return (results.getNumFound() > 0);
216         } catch (Exception ex) {
217             throw new MCRException(ex);
218         }
219     }
220 
221     private Element retrieveAndConvertPublication(String externalID) {
222         String uri = new MessageFormat(importURI, Locale.ROOT).format(new String[] { externalID });
223         return MCRURIResolver.instance().resolve(uri);
224     }
225 
226     /** If mods:genre was not mapped by conversion/import function, ignore this publication and do not import */
227     private static boolean shouldIgnore(Element publication) {
228         return !publication.getDescendants(new ElementFilter("genre", MCRConstants.MODS_NAMESPACE)).hasNext();
229     }
230 
231     private MCRObject buildMCRObject(Element publicationXML, String projectID) {
232         MCRObject obj = new MCRObject(new Document(publicationXML));
233         MCRMODSWrapper wrapper = new MCRMODSWrapper(obj);
234         wrapper.setServiceFlag("status", STATUS_FLAG);
235         MCRObjectID oid = MCRObjectID.getNextFreeId(projectID, "mods");
236         obj.setId(oid);
237         return obj;
238     }
239 
240     private void sendNotificationMail(List<MCRObject> importedObjects) throws Exception {
241         Element xml = new Element(STATUS_FLAG).setAttribute("source", this.sourceSystemID);
242         for (MCRObject obj : importedObjects) {
243             xml.addContent(obj.createXML().detachRootElement());
244         }
245 
246         HashMap<String, String> parameters = new HashMap<>();
247         parameters.put(PROPERTY_MAIL_ADDRESS, MCRConfiguration2.getStringOrThrow(PROPERTY_MAIL_ADDRESS));
248         MCRMailer.sendMail(new Document(xml), xsl2BuildNotificationMail, parameters);
249     }
250 }