View Javadoc
1   /*
2    * This file is part of ***  M y C o R e  ***
3    * See http://www.mycore.de/ for details.
4    *
5    * MyCoRe is free software: you can redistribute it and/or modify
6    * it under the terms of the GNU General Public License as published by
7    * the Free Software Foundation, either version 3 of the License, or
8    * (at your option) any later version.
9    *
10   * MyCoRe is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU General Public License for more details.
14   *
15   * You should have received a copy of the GNU General Public License
16   * along with MyCoRe.  If not, see <http://www.gnu.org/licenses/>.
17   */
18  
19  package org.mycore.frontend.indexbrowser;
20  
import java.io.File;
import java.io.IOException;
import java.nio.file.NotDirectoryException;
import java.text.NumberFormat;
import java.text.SimpleDateFormat;
import java.time.Instant;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.TimeZone;
import java.util.stream.Collectors;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.Namespace;
import org.mycore.common.config.MCRConfiguration2;
import org.mycore.datamodel.common.MCRObjectIDDate;
import org.mycore.datamodel.ifs2.MCRObjectIDDateImpl;
import org.mycore.solr.MCRSolrClientFactory;
47  
48  /**
49   * This class implements all common methods to create the sitemap data.
50   * <br>
51   * used properties:
52   * <br>
53   * <ul>
54   * <li>MCR.baseurl - the application base URL</li>
55   * <li>MCR.WebApplication.basedir - the directory where the web application is stored</li>
56   * <li>MCR.GoogleSitemap.Directory - the directory where the sitemap should be stored relative to
57   *      MCR.WebApplication.basedir (it could be empty)</li>
58   * <li>MCR.GoogleSitemap.Types - a list of MCRObject types, they should be included</li>
59   * <li>MCR.GoogleSitemap.Freq - the frequency of harvesting, 'monthly' is default<li>
60   * <li>MCR.GoogleSitemap.Style - a style extension for the URL in form of ?XSL.Style={style}, default is empty</li>
61   * <li>MCR.GoogleSitemap.ObjectPath - the path to get the MCRObject in the sitemap URL, 'receive/' is default</li>
62   * <li>MCR.GoogleSitemap.NumberOfURLs - the number of URLs in one sitemap file, 10000 is default</li>
63   * </ul>
64   *
65   * see http://www.sitemaps.org/de/protocol.html
66   *
67   * @author Frank Lützenkirchen
68   * @author Jens Kupferschmidt
69   * @author Thomas Scheffler (yagee)
70   * @version $Revision$ $Date$
71   *
72   */
73  public final class MCRGoogleSitemapCommon {
74  
75      /** Zone information **/
76      private static final Locale SITEMAP_LOCALE = Locale.ROOT;
77  
78      private static final TimeZone SITEMAP_TIMEZONE = TimeZone.getTimeZone("UTC");
79  
80      /** The namespaces */
81      private static final Namespace NS = Namespace.getNamespace("http://www.sitemaps.org/schemas/sitemap/0.9");
82  
83      private static final String XSI_URL = "http://www.w3.org/2001/XMLSchema-instance";
84  
85      private static final Namespace XSI_NAMESPACE = Namespace.getNamespace("xsi", XSI_URL);
86  
87      private static final String SITEINDEX_SCHEMA
88          = "http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd";
89  
90      private static final String SITEMAP_SCHEMA
91          = "http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd";
92  
93      /** The directory path to store sitemaps relative to MCR.WebApplication.basedir */
94      private static final String CDIR = MCRConfiguration2.getString("MCR.GoogleSitemap.Directory").orElse("");
95  
96      /** The frequence of crawle by Google */
97      private static final String FREQ = MCRConfiguration2.getString("MCR.GoogleSitemap.Freq").orElse("monthly");
98  
99      /** The style for by Google link */
100     private static final String STYLE = MCRConfiguration2.getString("MCR.GoogleSitemap.Style").orElse("");
101 
102     /** The url path for retrieving object metadata */
103     private static final String OBJECT_PATH = MCRConfiguration2.getString("MCR.GoogleSitemap.ObjectPath")
104         .orElse("receive/");
105 
106     /** The filter query for selecting objects to present in google sitemap */
107     private static final String SOLR_QUERY = MCRConfiguration2.getStringOrThrow("MCR.GoogleSitemap.SolrQuery");
108 
109     /** The logger */
110     private static Logger LOGGER = LogManager.getLogger(MCRGoogleSitemapCommon.class.getName());
111 
112     /** Number of URLs in one sitemap */
113     private int numberOfURLs = MCRConfiguration2.getInt("MCR.GoogleSitemap.NumberOfURLs").orElse(10000);
114 
115     /** number format for parts */
116     private static NumberFormat number_format = getNumberFormat();
117 
118     /** date formatter */
119     private static SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd", SITEMAP_LOCALE);
120 
121     /** The webapps directory path from configuration */
122     private final File webappBaseDir;
123 
124     /** The base URL */
125     private String baseurl = MCRConfiguration2.getString("MCR.baseurl").orElse("");
126 
127     /** local data */
128     private List<MCRObjectIDDate> objidlist = null;
129 
130     /** The constructor 
131      * @throws NotDirectoryException */
132     public MCRGoogleSitemapCommon(File baseDir) throws NotDirectoryException {
133         if (!Objects.requireNonNull(baseDir, "baseDir may not be null.").isDirectory()) {
134             throw new NotDirectoryException(baseDir.getAbsolutePath());
135         }
136         this.webappBaseDir = baseDir;
137         LOGGER.info("Using webappbaseDir: {}", baseDir.getAbsolutePath());
138         objidlist = new ArrayList<>();
139         if ((numberOfURLs < 1) || (numberOfURLs > 50000)) {
140             numberOfURLs = 50000;
141         }
142         if (CDIR.length() != 0) {
143             File sitemapDirectory = new File(webappBaseDir, CDIR);
144             if (!sitemapDirectory.exists()) {
145                 sitemapDirectory.mkdirs();
146             }
147         }
148     }
149 
150     public MCRGoogleSitemapCommon(String baseURL, File baseDir) throws NotDirectoryException {
151         this(baseDir);
152         this.baseurl = baseURL;
153     }
154 
155     private static NumberFormat getNumberFormat() {
156         NumberFormat nf = NumberFormat.getIntegerInstance(SITEMAP_LOCALE);
157         nf.setMinimumFractionDigits(5);
158         return nf;
159     }
160 
161     /**
162      * The method computes the number of sitemap files. If we have less than
163      * <em>numberOfURLs</em> URLs and only one MyCoRe type the sitemap_google.xml
164      * contained all URLs. Otherwise it split the sitemap in an sitemap_google.xml
165      * index file and a lot of sitemap_google_xxxx.xml URL files.
166      *
167      * @return the number of files, one for a single sitemap_google.xml file, more than
168      *         one for the index and all parts.
169      *
170      */
171     protected int checkSitemapFile() throws IOException {
172         int number = 0;
173         QueryResponse response;
174         SolrQuery query = new SolrQuery();
175         query.setQuery(SOLR_QUERY);
176         query.setRows(Integer.MAX_VALUE);
177         query.setParam("fl", "id,modified");
178 
179         try {
180             response = MCRSolrClientFactory.getMainSolrClient().query(query);
181             objidlist = response.getResults().stream().map((document) -> {
182                 String id = (String) document.getFieldValue("id");
183                 Date modified = (Date) document.getFieldValue("modified");
184 
185                 return new MCRObjectIDDateImpl(modified, id);
186             }).collect(Collectors.toList());
187 
188         } catch (SolrServerException e) {
189             LOGGER.error(e);
190         }
191         number = objidlist.size() / numberOfURLs;
192         if (objidlist.size() % numberOfURLs != 0) {
193             number++;
194         }
195         return number;
196     }
197 
198     /**
199      * The method return the path to the sitemap_google.xml file.
200      *
201      * @param number
202      *            number of this file - '1' = sitemap_google.xml - '&gt; 1' sitemap_google_xxx.xml
203      * @param withpath
204      *            true for the full path, false for the file name
205      * @return a path to sitemap_google.xml
206      */
207     protected String getFileName(int number, boolean withpath) {
208         String fn = "sitemap_google.xml";
209         if (number > 1) {
210             fn = "sitemap_google_" + number_format.format(number - 1) + ".xml";
211         }
212         String localPath = fn;
213         if (CDIR.length() != 0) {
214             localPath = CDIR + File.separator + fn;
215         }
216         if (withpath) {
217             return webappBaseDir + File.separator + localPath;
218         }
219         return localPath;
220     }
221 
222     /**
223      * The method build the sitemap_google.xml JDOM document over all items.
224      *
225      * @return The sitemap_google.xml as JDOM document
226      */
227     protected Document buildSingleSitemap() throws Exception {
228         LOGGER.debug("Build Google URL sitemap_google.xml for whole items.");
229         // build document frame
230         Element urlset = new Element("urlset", NS);
231         urlset.addNamespaceDeclaration(XSI_NAMESPACE);
232         urlset.setAttribute("noNamespaceSchemaLocation", SITEMAP_SCHEMA, XSI_NAMESPACE);
233         Document jdom = new Document(urlset);
234         // build over all types
235         for (MCRObjectIDDate objectIDDate : objidlist) {
236             urlset.addContent(buildURLElement(objectIDDate));
237         }
238         return jdom;
239     }
240 
241     /**
242      * The method call the database and build the sitemap_google.xml JDOM document.
243      *
244      * @param number
245      *            number of this file - '1' = sitemap_google.xml - '&gt; 1' sitemap_google_xxx.xml
246      * @return The sitemap.xml as JDOM document
247      */
248     protected Document buildPartSitemap(int number) throws Exception {
249         LOGGER.debug("Build Google URL sitemap list number {}", Integer.toString(number));
250         // build document frame
251         Element urlset = new Element("urlset", NS);
252         urlset.addNamespaceDeclaration(XSI_NAMESPACE);
253         urlset.setAttribute("schemaLocation", SITEMAP_SCHEMA, XSI_NAMESPACE);
254         Document jdom = new Document(urlset);
255 
256         // build over all types
257         int start = numberOfURLs * (number);
258         int stop = numberOfURLs * (number + 1);
259         if (stop > objidlist.size()) {
260             stop = objidlist.size();
261         }
262         LOGGER.debug("Build Google URL in range from {} to {}.", Integer.toString(start), Integer.toString(stop - 1));
263         for (int i = start; i < stop; i++) {
264             MCRObjectIDDate objectIDDate = objidlist.get(i);
265             urlset.addContent(buildURLElement(objectIDDate));
266 
267         }
268         return jdom;
269     }
270 
271     private Element buildURLElement(MCRObjectIDDate objectIDDate) {
272         String mcrID = objectIDDate.getId();
273         StringBuilder sb = new StringBuilder(1024);
274         sb.append(baseurl).append(OBJECT_PATH).append(mcrID);
275         if ((STYLE != null) && (STYLE.trim().length() > 0)) {
276             sb.append("?XSL.Style=").append(STYLE);
277         }
278         // build entry
279         Element url = new Element("url", NS);
280         url.addContent(new Element("loc", NS).addContent(sb.toString()));
281         String datestr = formatter.format(objectIDDate.getLastModified());
282         url.addContent(new Element("lastmod", NS).addContent(datestr));
283         url.addContent(new Element("changefreq", NS).addContent(FREQ));
284         return url;
285     }
286 
287     /**
288      * The method build the index sitemap_google.xml JDOM document.
289      *
290      * @param number
291      *            number of indexed files (must greater than 1
292      * @return The index sitemap_google.xml as JDOM document
293      */
294     protected Document buildSitemapIndex(int number) {
295         LOGGER.debug("Build Google sitemap number {}", Integer.toString(number));
296         // build document frame
297         Element index = new Element("sitemapindex", NS);
298         index.addNamespaceDeclaration(XSI_NAMESPACE);
299         index.setAttribute("schemaLocation", SITEINDEX_SCHEMA, XSI_NAMESPACE);
300         Document jdom = new Document(index);
301         // build over all files
302         for (int i = 0; i < number; i++) {
303             Element sitemap = new Element("sitemap", NS);
304             index.addContent(sitemap);
305             sitemap.addContent(new Element("loc", NS).addContent((baseurl + getFileName(i + 2, false)).trim()));
306             String datestr = formatter.format((new GregorianCalendar(SITEMAP_TIMEZONE, SITEMAP_LOCALE)).getTime());
307             sitemap.addContent(new Element("lastmod", NS).addContent(datestr.trim()));
308         }
309         return jdom;
310     }
311 
312     /**
313      * This method remove all sitemap files from the webapps directory.
314      */
315     protected void removeSitemapFiles() {
316         File dir = new File(webappBaseDir, CDIR);
317         File[] li = dir.listFiles();
318         if (li != null) {
319             for (File fi : li) {
320                 if (fi.getName().startsWith("sitemap_google")) {
321                     LOGGER.debug("Remove file {}", fi.getName());
322                     fi.delete();
323                 }
324             }
325         }
326     }
327 }