View Javadoc
1   /*
2    * This file is part of ***  M y C o R e  ***
3    * See http://www.mycore.de/ for details.
4    *
5    * MyCoRe is free software: you can redistribute it and/or modify
6    * it under the terms of the GNU General Public License as published by
7    * the Free Software Foundation, either version 3 of the License, or
8    * (at your option) any later version.
9    *
10   * MyCoRe is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU General Public License for more details.
14   *
15   * You should have received a copy of the GNU General Public License
16   * along with MyCoRe.  If not, see <http://www.gnu.org/licenses/>.
17   */
18  
19  package org.mycore.mods;
20  
21  import java.util.ArrayList;
22  import java.util.LinkedHashMap;
23  import java.util.List;
24  import java.util.Map;
25  import java.util.Map.Entry;
26  import java.util.Set;
27  import java.util.regex.Matcher;
28  import java.util.regex.Pattern;
29  
30  import org.apache.xpath.NodeSet;
31  import org.jdom2.Element;
32  import org.jdom2.JDOMException;
33  import org.jdom2.output.DOMOutputter;
34  import org.mycore.common.MCRConstants;
35  
36  /**
37   * Builds a mods:extent[@unit='pages'] element from text containing pages information. 
38   * For example, the input "pp. 3-4" will generate mods:start and mods:end elements. 
39   * Additionally, variants of hyphens are normalized to a unique character.
40   * Third, incomplete end page numbers are completed, e.g. S. 3845 - 53 will result in 
41   * mods:end=3853.
42   * 
43   * @author Frank L\u00FCtzenkirchen 
44   **/
45  public class MCRMODSPagesHelper {
46  
47      private static final HyphenNormalizer HYPHEN_NORMALIZER = new HyphenNormalizer();
48  
49      private static final EndPageCompleter END_PAGE_COMPLETER = new EndPageCompleter();
50  
51      private static final ExtentPagesBuilder EXTENT_PAGES_BUILDER = new ExtentPagesBuilder();
52  
53      public static Element buildExtentPages(String input) {
54          String normalizedInput = input.trim();
55          normalizedInput = HYPHEN_NORMALIZER.normalize(normalizedInput);
56          Element extent = EXTENT_PAGES_BUILDER.buildExtent(normalizedInput);
57          END_PAGE_COMPLETER.completeEndPage(extent);
58          return extent;
59      }
60  
61      public static NodeSet buildExtentPagesNodeSet(String input) throws JDOMException {
62          Element extent = buildExtentPages(input);
63          org.w3c.dom.Element domElement = new DOMOutputter().output(extent);
64          NodeSet nodeSet = new NodeSet();
65          nodeSet.addNode(domElement);
66          return nodeSet;
67      }
68  }
69  
/**
 * Normalizes the different variants of hyphens and dashes in a given input text
 * to a simple ASCII "minus" character (U+002D).
 *
 * @author Frank L\u00FCtzenkirchen
 **/
class HyphenNormalizer {

    /** The character every hyphen variant is replaced with. */
    private static final char HYPHEN_NORM = '-';

    /**
     * Characters that are treated as variants of a hyphen: U+2010 to U+2015
     * (hyphen, non-breaking hyphen, figure dash, en dash, em dash, horizontal bar),
     * U+2212 (minus sign), U+2E3B (three-em dash), U+FE58 (small em dash) and
     * U+FE63 (small hyphen-minus).
     * The ASCII hyphen-minus itself is not listed, replacing it would be a no-op.
     */
    private static final char[] HYPHEN_VARIANTS = { '\u2010', '\u2011', '\u2012', '\u2013', '\u2014', '\u2015',
        '\u2212', '\u2E3B', '\uFE58', '\uFE63', };

    /**
     * Replaces every hyphen variant in the input with the ASCII hyphen-minus.
     *
     * @param input the text to normalize, must not be null
     * @return the text with all hyphen variants replaced, all other characters unchanged
     */
    String normalize(String input) {
        String normalizedInput = input;
        for (char hyphenVariant : HYPHEN_VARIANTS) {
            normalizedInput = normalizedInput.replace(hyphenVariant, HYPHEN_NORM);
        }
        return normalizedInput;
    }
}
90  
91  /**
92   * When start and end page are given, often only the differing prefix of the end page number is specified, e.g.
93   * "3845 - 53" meaning end page is 3853. This class completes the value of mods:end if start and end page
94   * are both numbers.    
95   * 
96   * @author Frank L\u00FCtzenkirchen 
97   **/
98  class EndPageCompleter {
99  
100     void completeEndPage(Element extent) {
101         String start = extent.getChildText("start", MCRConstants.MODS_NAMESPACE);
102         String end = extent.getChildText("end", MCRConstants.MODS_NAMESPACE);
103         if (isNumber(start) && isNumber(end) && start.length() > end.length()) {
104             end = start.substring(0, start.length() - end.length()) + end;
105             extent.getChild("end", MCRConstants.MODS_NAMESPACE).setText(end);
106         }
107     }
108 
109     boolean isNumber(String value) {
110         return ((value != null) && value.matches("\\d+"));
111     }
112 }
113 
114 /**
115  * Builds a mods:extent element containing appropriate child elements representing the same 
116  * pages information as some textual input. For example, the input "pp. 3-4" will generate mods:start and
117  * mods:end elements. 
118  * 
119  * @author Frank L\u00FCtzenkirchen 
120  **/
121 class ExtentPagesBuilder {
122 
123     private static final String OPTIONAL = "?";
124 
125     private static final String ZERO_OR_MORE = "*";
126 
127     private static final String ONE_OR_MORE = "+";
128 
129     private static final String NUMBER = "([0-9]+)";
130 
131     private static final String PAGENR = "([a-zA-Z0-9\\.]+)";
132 
133     private static final String SPACE = "\\s";
134 
135     private static final String SPACES = SPACE + ONE_OR_MORE;
136 
137     private static final String SPACES_OPTIONAL = SPACE + ZERO_OR_MORE;
138 
139     private static final String DOT = "\\.";
140 
141     private static final String HYPHEN = SPACES_OPTIONAL + "-" + SPACES_OPTIONAL;
142 
143     private static final String PAGENR_W_HYPHEN = "([a-zA-Z0-9-\\.]+)";
144 
145     private static final String HYPHEN_SEPARATED = SPACES + "-" + SPACES;
146 
147     private static final String PAGE = "([sSp]{1,2}" + DOT + OPTIONAL + SPACES_OPTIONAL + ")" + OPTIONAL;
148 
149     private static final String PAGES = "(pages|[Ss]eiten|S\\.)";
150 
151     private static final String FF = "(" + SPACES + "ff?\\.?)" + OPTIONAL;
152 
153     private List<PagesPattern> patterns = new ArrayList<>();
154 
155     ExtentPagesBuilder() {
156         PagesPattern startEnd = new PagesPattern(PAGE + PAGENR + HYPHEN + PAGENR + DOT + OPTIONAL);
157         startEnd.addMapping("start", 2);
158         startEnd.addMapping("end", 3);
159         patterns.add(startEnd);
160 
161         PagesPattern startEndVariant = new PagesPattern(
162             PAGE + PAGENR_W_HYPHEN + HYPHEN_SEPARATED + PAGENR_W_HYPHEN + DOT + OPTIONAL);
163         startEndVariant.addMapping("start", 2);
164         startEndVariant.addMapping("end", 3);
165         patterns.add(startEndVariant);
166 
167         PagesPattern startTotal = new PagesPattern(
168             PAGE + PAGENR + SPACES + "\\(" + NUMBER + SPACES_OPTIONAL + PAGES + "\\)");
169         startTotal.addMapping("start", 2);
170         startTotal.addMapping("total", 3);
171         patterns.add(startTotal);
172 
173         PagesPattern startOnly = new PagesPattern(PAGE + PAGENR + FF);
174         startOnly.addMapping("start", 2);
175         patterns.add(startOnly);
176 
177         PagesPattern totalOnly = new PagesPattern("\\(?" + PAGENR + SPACES + PAGES + OPTIONAL + "\\)?");
178         totalOnly.addMapping("total", 1);
179         patterns.add(totalOnly);
180 
181         PagesPattern list = new PagesPattern("(.+)");
182         list.addMapping("list", 1);
183         patterns.add(list);
184     }
185 
186     /**
187      * Builds a mods:extent element containing appropriate child elements representing the same 
188      * pages information as the textual input. For example, the input "3-4" will generate mods:start and
189      * mods:end elements. 
190      * 
191      * @param input the textual pages information, e.g. "S. 3-4" or "p. 123 (9 pages)" 
192      */
193     Element buildExtent(String input) {
194         Element extent = buildExtent();
195 
196         for (PagesPattern pattern : patterns) {
197             PagesMatcher matcher = pattern.matcher(input);
198             if (matcher.matches()) {
199                 matcher.addMODSto(extent);
200                 break;
201             }
202         }
203         return extent;
204     }
205 
206     /** Builds a new mods:extent element to hold pages information */
207     private Element buildExtent() {
208         Element extent = new Element("extent", MCRConstants.MODS_NAMESPACE);
209         extent.setAttribute("unit", "pages");
210         return extent;
211     }
212 }
213 
214 /**
215  * Represents a text pattern containing pages information, like start & end page,
216  * or start and total number of pages as textual representation. Manages a mapping
217  * between matching groups in pattern and the corresponding MODS elements, e.g. 
218  * mods:start and mods:end.
219  * 
220  * @author Frank L\u00FCtzenkirchen 
221  **/
222 class PagesPattern {
223 
224     private Pattern pattern;
225 
226     /** Mapping from MODS Element name to group number in the pattern */
227     private Map<String, Integer> mods2group = new LinkedHashMap<>();
228 
229     PagesPattern(String regularExpression) {
230         pattern = Pattern.compile(regularExpression);
231     }
232 
233     /**
234      * Add a mapping from MODS Element name to group number in the pattern.
235      * 
236      * @param modsElement the name of the MODS element mapped, e.g. "start"
237      * @param groupNumber number of the group in the regular expression pattern
238      **/
239     void addMapping(String modsElement, int groupNumber) {
240         mods2group.put(modsElement, groupNumber);
241     }
242 
243     /**
244      * Returns the mappings from MODS Element name to group number
245      */
246     Set<Entry<String, Integer>> getMappings() {
247         return mods2group.entrySet();
248     }
249 
250     /**
251      * Returns a matcher for the given input text
252      */
253     PagesMatcher matcher(String input) {
254         return new PagesMatcher(this, pattern.matcher(input));
255     }
256 
257 }
258 
259 /**
260  * Represents a matcher for a given input text containing pages information,
261  * associated with a PagesPattern that possibly matches the text.
262  * 
263  * @author Frank L\u00FCtzenkirchen 
264  **/
265 class PagesMatcher {
266 
267     PagesPattern pattern;
268 
269     Matcher matcher;
270 
271     PagesMatcher(PagesPattern pattern, Matcher matcher) {
272         this.pattern = pattern;
273         this.matcher = matcher;
274     }
275 
276     boolean matches() {
277         return matcher.matches();
278     }
279 
280     /**
281      * If matches(), adds MODS elements mapped to the matching groups in the pattern.
282      * Note that matches() MUST be called first!
283      * 
284      * @param extent the mods:extent element to add new MODS elements to
285      */
286     void addMODSto(Element extent) {
287         for (Entry<String, Integer> mapping : pattern.getMappings()) {
288             int group = mapping.getValue();
289             String name = mapping.getKey();
290             Element mods = new Element(name, MCRConstants.MODS_NAMESPACE);
291             String value = matcher.group(group);
292             mods.setText(value);
293             extent.addContent(mods);
294         }
295     }
296 }