1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.mycore.mods;
20
21 import java.util.ArrayList;
22 import java.util.LinkedHashMap;
23 import java.util.List;
24 import java.util.Map;
25 import java.util.Map.Entry;
26 import java.util.Set;
27 import java.util.regex.Matcher;
28 import java.util.regex.Pattern;
29
30 import org.apache.xpath.NodeSet;
31 import org.jdom2.Element;
32 import org.jdom2.JDOMException;
33 import org.jdom2.output.DOMOutputter;
34 import org.mycore.common.MCRConstants;
35
36
37
38
39
40
41
42
43
44
45 public class MCRMODSPagesHelper {
46
47 private static final HyphenNormalizer HYPHEN_NORMALIZER = new HyphenNormalizer();
48
49 private static final EndPageCompleter END_PAGE_COMPLETER = new EndPageCompleter();
50
51 private static final ExtentPagesBuilder EXTENT_PAGES_BUILDER = new ExtentPagesBuilder();
52
53 public static Element buildExtentPages(String input) {
54 String normalizedInput = input.trim();
55 normalizedInput = HYPHEN_NORMALIZER.normalize(normalizedInput);
56 Element extent = EXTENT_PAGES_BUILDER.buildExtent(normalizedInput);
57 END_PAGE_COMPLETER.completeEndPage(extent);
58 return extent;
59 }
60
61 public static NodeSet buildExtentPagesNodeSet(String input) throws JDOMException {
62 Element extent = buildExtentPages(input);
63 org.w3c.dom.Element domElement = new DOMOutputter().output(extent);
64 NodeSet nodeSet = new NodeSet();
65 nodeSet.addNode(domElement);
66 return nodeSet;
67 }
68 }
69
70
71
72
73
74
/**
 * Replaces the various Unicode hyphen and dash characters by the plain ASCII
 * hyphen-minus, so that later pattern matching only has to deal with '-'.
 */
class HyphenNormalizer {

    /** The character every variant is mapped to (U+002D HYPHEN-MINUS). */
    private static final char HYPHEN_NORM = '-';

    /**
     * Characters treated as a hyphen. The table is shared and never modified.
     * U+2014 EM DASH is part of the list: without it an input such as
     * "12(em dash)34" would keep its dash and never be recognized as a
     * start/end page range.
     */
    private static final char[] HYPHEN_VARIANTS = { '\u002D', '\u2010', '\u2011', '\u2012', '\u2013', '\u2014',
        '\u2015', '\u2212', '\u2E3B', '\uFE58', '\uFE63', };

    /**
     * Returns a copy of the input in which every known hyphen variant is
     * replaced by {@link #HYPHEN_NORM}. All other characters are left untouched.
     *
     * @param input the text to normalize, must not be null
     * @return the text with all hyphen variants replaced by '-'
     */
    String normalize(String input) {
        String normalizedInput = input;
        for (char hyphenVariant : HYPHEN_VARIANTS) {
            normalizedInput = normalizedInput.replace(hyphenVariant, HYPHEN_NORM);
        }
        return normalizedInput;
    }
}
90
91
92
93
94
95
96
97
98 class EndPageCompleter {
99
100 void completeEndPage(Element extent) {
101 String start = extent.getChildText("start", MCRConstants.MODS_NAMESPACE);
102 String end = extent.getChildText("end", MCRConstants.MODS_NAMESPACE);
103 if (isNumber(start) && isNumber(end) && start.length() > end.length()) {
104 end = start.substring(0, start.length() - end.length()) + end;
105 extent.getChild("end", MCRConstants.MODS_NAMESPACE).setText(end);
106 }
107 }
108
109 boolean isNumber(String value) {
110 return ((value != null) && value.matches("\\d+"));
111 }
112 }
113
114
115
116
117
118
119
120
121 class ExtentPagesBuilder {
122
123 private static final String OPTIONAL = "?";
124
125 private static final String ZERO_OR_MORE = "*";
126
127 private static final String ONE_OR_MORE = "+";
128
129 private static final String NUMBER = "([0-9]+)";
130
131 private static final String PAGENR = "([a-zA-Z0-9\\.]+)";
132
133 private static final String SPACE = "\\s";
134
135 private static final String SPACES = SPACE + ONE_OR_MORE;
136
137 private static final String SPACES_OPTIONAL = SPACE + ZERO_OR_MORE;
138
139 private static final String DOT = "\\.";
140
141 private static final String HYPHEN = SPACES_OPTIONAL + "-" + SPACES_OPTIONAL;
142
143 private static final String PAGENR_W_HYPHEN = "([a-zA-Z0-9-\\.]+)";
144
145 private static final String HYPHEN_SEPARATED = SPACES + "-" + SPACES;
146
147 private static final String PAGE = "([sSp]{1,2}" + DOT + OPTIONAL + SPACES_OPTIONAL + ")" + OPTIONAL;
148
149 private static final String PAGES = "(pages|[Ss]eiten|S\\.)";
150
151 private static final String FF = "(" + SPACES + "ff?\\.?)" + OPTIONAL;
152
153 private List<PagesPattern> patterns = new ArrayList<>();
154
155 ExtentPagesBuilder() {
156 PagesPattern startEnd = new PagesPattern(PAGE + PAGENR + HYPHEN + PAGENR + DOT + OPTIONAL);
157 startEnd.addMapping("start", 2);
158 startEnd.addMapping("end", 3);
159 patterns.add(startEnd);
160
161 PagesPattern startEndVariant = new PagesPattern(
162 PAGE + PAGENR_W_HYPHEN + HYPHEN_SEPARATED + PAGENR_W_HYPHEN + DOT + OPTIONAL);
163 startEndVariant.addMapping("start", 2);
164 startEndVariant.addMapping("end", 3);
165 patterns.add(startEndVariant);
166
167 PagesPattern startTotal = new PagesPattern(
168 PAGE + PAGENR + SPACES + "\\(" + NUMBER + SPACES_OPTIONAL + PAGES + "\\)");
169 startTotal.addMapping("start", 2);
170 startTotal.addMapping("total", 3);
171 patterns.add(startTotal);
172
173 PagesPattern startOnly = new PagesPattern(PAGE + PAGENR + FF);
174 startOnly.addMapping("start", 2);
175 patterns.add(startOnly);
176
177 PagesPattern totalOnly = new PagesPattern("\\(?" + PAGENR + SPACES + PAGES + OPTIONAL + "\\)?");
178 totalOnly.addMapping("total", 1);
179 patterns.add(totalOnly);
180
181 PagesPattern list = new PagesPattern("(.+)");
182 list.addMapping("list", 1);
183 patterns.add(list);
184 }
185
186
187
188
189
190
191
192
193 Element buildExtent(String input) {
194 Element extent = buildExtent();
195
196 for (PagesPattern pattern : patterns) {
197 PagesMatcher matcher = pattern.matcher(input);
198 if (matcher.matches()) {
199 matcher.addMODSto(extent);
200 break;
201 }
202 }
203 return extent;
204 }
205
206
207 private Element buildExtent() {
208 Element extent = new Element("extent", MCRConstants.MODS_NAMESPACE);
209 extent.setAttribute("unit", "pages");
210 return extent;
211 }
212 }
213
214
215
216
217
218
219
220
221
222 class PagesPattern {
223
224 private Pattern pattern;
225
226
227 private Map<String, Integer> mods2group = new LinkedHashMap<>();
228
229 PagesPattern(String regularExpression) {
230 pattern = Pattern.compile(regularExpression);
231 }
232
233
234
235
236
237
238
239 void addMapping(String modsElement, int groupNumber) {
240 mods2group.put(modsElement, groupNumber);
241 }
242
243
244
245
246 Set<Entry<String, Integer>> getMappings() {
247 return mods2group.entrySet();
248 }
249
250
251
252
253 PagesMatcher matcher(String input) {
254 return new PagesMatcher(this, pattern.matcher(input));
255 }
256
257 }
258
259
260
261
262
263
264
265 class PagesMatcher {
266
267 PagesPattern pattern;
268
269 Matcher matcher;
270
271 PagesMatcher(PagesPattern pattern, Matcher matcher) {
272 this.pattern = pattern;
273 this.matcher = matcher;
274 }
275
276 boolean matches() {
277 return matcher.matches();
278 }
279
280
281
282
283
284
285
286 void addMODSto(Element extent) {
287 for (Entry<String, Integer> mapping : pattern.getMappings()) {
288 int group = mapping.getValue();
289 String name = mapping.getKey();
290 Element mods = new Element(name, MCRConstants.MODS_NAMESPACE);
291 String value = matcher.group(group);
292 mods.setText(value);
293 extent.addContent(mods);
294 }
295 }
296 }