1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.mycore.frontend.indexbrowser;
20
import java.io.File;
import java.io.IOException;
import java.nio.file.NotDirectoryException;
import java.text.NumberFormat;
import java.text.SimpleDateFormat;
import java.time.Instant;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.TimeZone;
import java.util.stream.Collectors;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.Namespace;
import org.mycore.common.config.MCRConfiguration2;
import org.mycore.datamodel.common.MCRObjectIDDate;
import org.mycore.datamodel.ifs2.MCRObjectIDDateImpl;
import org.mycore.solr.MCRSolrClientFactory;
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73 public final class MCRGoogleSitemapCommon {
74
75
76 private static final Locale SITEMAP_LOCALE = Locale.ROOT;
77
78 private static final TimeZone SITEMAP_TIMEZONE = TimeZone.getTimeZone("UTC");
79
80
81 private static final Namespace NS = Namespace.getNamespace("http://www.sitemaps.org/schemas/sitemap/0.9");
82
83 private static final String XSI_URL = "http://www.w3.org/2001/XMLSchema-instance";
84
85 private static final Namespace XSI_NAMESPACE = Namespace.getNamespace("xsi", XSI_URL);
86
87 private static final String SITEINDEX_SCHEMA
88 = "http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd";
89
90 private static final String SITEMAP_SCHEMA
91 = "http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd";
92
93
94 private static final String CDIR = MCRConfiguration2.getString("MCR.GoogleSitemap.Directory").orElse("");
95
96
97 private static final String FREQ = MCRConfiguration2.getString("MCR.GoogleSitemap.Freq").orElse("monthly");
98
99
100 private static final String STYLE = MCRConfiguration2.getString("MCR.GoogleSitemap.Style").orElse("");
101
102
103 private static final String OBJECT_PATH = MCRConfiguration2.getString("MCR.GoogleSitemap.ObjectPath")
104 .orElse("receive/");
105
106
107 private static final String SOLR_QUERY = MCRConfiguration2.getStringOrThrow("MCR.GoogleSitemap.SolrQuery");
108
109
110 private static Logger LOGGER = LogManager.getLogger(MCRGoogleSitemapCommon.class.getName());
111
112
113 private int numberOfURLs = MCRConfiguration2.getInt("MCR.GoogleSitemap.NumberOfURLs").orElse(10000);
114
115
116 private static NumberFormat number_format = getNumberFormat();
117
118
119 private static SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd", SITEMAP_LOCALE);
120
121
122 private final File webappBaseDir;
123
124
125 private String baseurl = MCRConfiguration2.getString("MCR.baseurl").orElse("");
126
127
128 private List<MCRObjectIDDate> objidlist = null;
129
130
131
132 public MCRGoogleSitemapCommon(File baseDir) throws NotDirectoryException {
133 if (!Objects.requireNonNull(baseDir, "baseDir may not be null.").isDirectory()) {
134 throw new NotDirectoryException(baseDir.getAbsolutePath());
135 }
136 this.webappBaseDir = baseDir;
137 LOGGER.info("Using webappbaseDir: {}", baseDir.getAbsolutePath());
138 objidlist = new ArrayList<>();
139 if ((numberOfURLs < 1) || (numberOfURLs > 50000)) {
140 numberOfURLs = 50000;
141 }
142 if (CDIR.length() != 0) {
143 File sitemapDirectory = new File(webappBaseDir, CDIR);
144 if (!sitemapDirectory.exists()) {
145 sitemapDirectory.mkdirs();
146 }
147 }
148 }
149
150 public MCRGoogleSitemapCommon(String baseURL, File baseDir) throws NotDirectoryException {
151 this(baseDir);
152 this.baseurl = baseURL;
153 }
154
155 private static NumberFormat getNumberFormat() {
156 NumberFormat nf = NumberFormat.getIntegerInstance(SITEMAP_LOCALE);
157 nf.setMinimumFractionDigits(5);
158 return nf;
159 }
160
161
162
163
164
165
166
167
168
169
170
171 protected int checkSitemapFile() throws IOException {
172 int number = 0;
173 QueryResponse response;
174 SolrQuery query = new SolrQuery();
175 query.setQuery(SOLR_QUERY);
176 query.setRows(Integer.MAX_VALUE);
177 query.setParam("fl", "id,modified");
178
179 try {
180 response = MCRSolrClientFactory.getMainSolrClient().query(query);
181 objidlist = response.getResults().stream().map((document) -> {
182 String id = (String) document.getFieldValue("id");
183 Date modified = (Date) document.getFieldValue("modified");
184
185 return new MCRObjectIDDateImpl(modified, id);
186 }).collect(Collectors.toList());
187
188 } catch (SolrServerException e) {
189 LOGGER.error(e);
190 }
191 number = objidlist.size() / numberOfURLs;
192 if (objidlist.size() % numberOfURLs != 0) {
193 number++;
194 }
195 return number;
196 }
197
198
199
200
201
202
203
204
205
206
207 protected String getFileName(int number, boolean withpath) {
208 String fn = "sitemap_google.xml";
209 if (number > 1) {
210 fn = "sitemap_google_" + number_format.format(number - 1) + ".xml";
211 }
212 String localPath = fn;
213 if (CDIR.length() != 0) {
214 localPath = CDIR + File.separator + fn;
215 }
216 if (withpath) {
217 return webappBaseDir + File.separator + localPath;
218 }
219 return localPath;
220 }
221
222
223
224
225
226
227 protected Document buildSingleSitemap() throws Exception {
228 LOGGER.debug("Build Google URL sitemap_google.xml for whole items.");
229
230 Element urlset = new Element("urlset", NS);
231 urlset.addNamespaceDeclaration(XSI_NAMESPACE);
232 urlset.setAttribute("noNamespaceSchemaLocation", SITEMAP_SCHEMA, XSI_NAMESPACE);
233 Document jdom = new Document(urlset);
234
235 for (MCRObjectIDDate objectIDDate : objidlist) {
236 urlset.addContent(buildURLElement(objectIDDate));
237 }
238 return jdom;
239 }
240
241
242
243
244
245
246
247
248 protected Document buildPartSitemap(int number) throws Exception {
249 LOGGER.debug("Build Google URL sitemap list number {}", Integer.toString(number));
250
251 Element urlset = new Element("urlset", NS);
252 urlset.addNamespaceDeclaration(XSI_NAMESPACE);
253 urlset.setAttribute("schemaLocation", SITEMAP_SCHEMA, XSI_NAMESPACE);
254 Document jdom = new Document(urlset);
255
256
257 int start = numberOfURLs * (number);
258 int stop = numberOfURLs * (number + 1);
259 if (stop > objidlist.size()) {
260 stop = objidlist.size();
261 }
262 LOGGER.debug("Build Google URL in range from {} to {}.", Integer.toString(start), Integer.toString(stop - 1));
263 for (int i = start; i < stop; i++) {
264 MCRObjectIDDate objectIDDate = objidlist.get(i);
265 urlset.addContent(buildURLElement(objectIDDate));
266
267 }
268 return jdom;
269 }
270
271 private Element buildURLElement(MCRObjectIDDate objectIDDate) {
272 String mcrID = objectIDDate.getId();
273 StringBuilder sb = new StringBuilder(1024);
274 sb.append(baseurl).append(OBJECT_PATH).append(mcrID);
275 if ((STYLE != null) && (STYLE.trim().length() > 0)) {
276 sb.append("?XSL.Style=").append(STYLE);
277 }
278
279 Element url = new Element("url", NS);
280 url.addContent(new Element("loc", NS).addContent(sb.toString()));
281 String datestr = formatter.format(objectIDDate.getLastModified());
282 url.addContent(new Element("lastmod", NS).addContent(datestr));
283 url.addContent(new Element("changefreq", NS).addContent(FREQ));
284 return url;
285 }
286
287
288
289
290
291
292
293
294 protected Document buildSitemapIndex(int number) {
295 LOGGER.debug("Build Google sitemap number {}", Integer.toString(number));
296
297 Element index = new Element("sitemapindex", NS);
298 index.addNamespaceDeclaration(XSI_NAMESPACE);
299 index.setAttribute("schemaLocation", SITEINDEX_SCHEMA, XSI_NAMESPACE);
300 Document jdom = new Document(index);
301
302 for (int i = 0; i < number; i++) {
303 Element sitemap = new Element("sitemap", NS);
304 index.addContent(sitemap);
305 sitemap.addContent(new Element("loc", NS).addContent((baseurl + getFileName(i + 2, false)).trim()));
306 String datestr = formatter.format((new GregorianCalendar(SITEMAP_TIMEZONE, SITEMAP_LOCALE)).getTime());
307 sitemap.addContent(new Element("lastmod", NS).addContent(datestr.trim()));
308 }
309 return jdom;
310 }
311
312
313
314
315 protected void removeSitemapFiles() {
316 File dir = new File(webappBaseDir, CDIR);
317 File[] li = dir.listFiles();
318 if (li != null) {
319 for (File fi : li) {
320 if (fi.getName().startsWith("sitemap_google")) {
321 LOGGER.debug("Remove file {}", fi.getName());
322 fi.delete();
323 }
324 }
325 }
326 }
327 }