001 package org.maltparser.core.syntaxgraph.reader;
002
003 import java.io.BufferedReader;
004 import java.io.FileInputStream;
005 import java.io.FileNotFoundException;
006 import java.io.IOException;
007 import java.io.InputStream;
008 import java.io.InputStreamReader;
009 import java.io.UnsupportedEncodingException;
010 import java.net.URL;
011 import java.util.SortedMap;
012 import java.util.regex.PatternSyntaxException;
013
014 import javax.xml.stream.XMLInputFactory;
015 import javax.xml.stream.XMLStreamConstants;
016 import javax.xml.stream.XMLStreamException;
017 import javax.xml.stream.XMLStreamReader;
018
019 import org.maltparser.core.exception.MaltChainedException;
020 import org.maltparser.core.io.dataformat.DataFormatException;
021 import org.maltparser.core.io.dataformat.DataFormatInstance;
022 import org.maltparser.core.symbol.SymbolTable;
023 import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
024 import org.maltparser.core.syntaxgraph.PhraseStructure;
025 import org.maltparser.core.syntaxgraph.SyntaxGraphException;
026 import org.maltparser.core.syntaxgraph.TokenStructure;
027 import org.maltparser.core.syntaxgraph.edge.Edge;
028 import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
029 import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
030
031 /**
032 *
033 *
034 * @author Johan Hall
035 */
036 public class TigerXMLReader implements SyntaxGraphReader {
037 // private TigerXMLHeader header;
038 private XMLStreamReader reader;
039 private int sentenceCount;
040 private DataFormatInstance dataFormatInstance;
041 private StringBuffer ntid;
042 private final StringBuilder graphRootID;
043 // private StringBuilder elementContent;
044 // private StringBuilder valueName;
045 // private StringBuilder currentFeatureName;
046 // private Domain domain;
047 // private boolean collectChar = false;
048 private String optionString;
049 private String fileName = null;
050 private URL url = null;
051 private String charsetName;
052 private int nIterations;
053 private int cIterations;
054 private int START_ID_OF_NONTERMINALS = 500;
055 private boolean closeStream = true;
056
057 public TigerXMLReader() {
058 this.ntid = new StringBuffer();
059 // elementContent = new StringBuilder();
060 // valueName = new StringBuilder();
061 // currentFeatureName = new StringBuilder();
062 graphRootID = new StringBuilder();
063 nIterations = 1;
064 cIterations = 1;
065 }
066
067 private void reopen() throws MaltChainedException {
068 close();
069 if (fileName != null) {
070 open(fileName, charsetName);
071 } else if (url != null) {
072 open(url, charsetName);
073 } else {
074 throw new DataFormatException("The input stream cannot be reopen. ");
075 }
076 }
077
078 public void open(String fileName, String charsetName) throws MaltChainedException {
079 setFileName(fileName);
080 setCharsetName(charsetName);
081 try {
082 open(new FileInputStream(fileName), charsetName);
083 }catch (FileNotFoundException e) {
084 throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
085 }
086 }
087 public void open(URL url, String charsetName) throws MaltChainedException {
088 setUrl(url);
089 setCharsetName(charsetName);
090 try {
091 open(url.openStream(), charsetName);
092 } catch (IOException e) {
093 throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
094 }
095 }
096
097 public void open(InputStream is, String charsetName) throws MaltChainedException {
098 try {
099 if (is == System.in) {
100 closeStream = false;
101 }
102 open(new InputStreamReader(is, charsetName));
103 } catch (UnsupportedEncodingException e) {
104 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
105 }
106 }
107
108 private void open(InputStreamReader isr) throws MaltChainedException {
109 try {
110 XMLInputFactory factory = XMLInputFactory.newInstance();
111 setReader(factory.createXMLStreamReader(new BufferedReader(isr)));
112 } catch (XMLStreamException e) {
113 throw new DataFormatException("XML input file could be opened. ", e);
114 }
115 setSentenceCount(0);
116 }
117
118 public void readProlog() throws MaltChainedException {
119
120 }
121
122 public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException {
123 if (syntaxGraph == null || !(syntaxGraph instanceof PhraseStructure)) {
124 return false;
125 }
126 syntaxGraph.clear();
127 final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
128 PhraseStructureNode parent = null;
129 PhraseStructureNode child = null;
130 // if (header == null) {
131 // header = new TigerXMLHeader(syntaxGraph.getSymbolTables());
132 // }
133
134 try {
135 while (true) {
136 int event = reader.next();
137 if (event == XMLStreamConstants.START_ELEMENT) {
138 if (reader.getLocalName().length() == 0) {
139 continue;
140 }
141 if (reader.getLocalName().charAt(0) == 'e') {
142 // e -> edge, edgelabel
143 if (reader.getLocalName().length() == 4) { //edge
144 int childid = -1;
145 int indexSep = reader.getAttributeValue(null, "idref").indexOf('_');
146
147 try {
148 if (indexSep != -1) {
149 childid = Integer.parseInt(reader.getAttributeValue(null, "idref").substring(indexSep+1));
150 } else {
151 childid = Integer.parseInt(reader.getAttributeValue(null, "idref"));
152 }
153 if (childid == -1) {
154 throw new SyntaxGraphException("The tiger reader couldn't recognize the idref attribute '"+reader.getAttributeValue(null, "idref")+"' of the edge element. ");
155 }
156 } catch (NumberFormatException e) {
157 throw new SyntaxGraphException("The tiger reader couldn't recognize the idref attribute '"+reader.getAttributeValue(null, "idref")+"' of the edge element. ");
158 }
159
160 if (childid < START_ID_OF_NONTERMINALS) {
161 child = phraseStructure.getTokenNode(childid);
162 } else {
163
164 child = phraseStructure.getNonTerminalNode(childid-START_ID_OF_NONTERMINALS+1);
165 }
166
167 Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
168 SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables();
169 for (String name : inputTables.keySet()) {
170 e.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase()));
171 }
172 } else if (reader.getLocalName().equals("edgelabel")) { // edgelabel
173 // domain = Domain.EL;
174 }
175 } else if (reader.getLocalName().charAt(0) == 'n') {
176 // n -> nt, nonterminals, name
177 if (reader.getLocalName().length() == 2) { // nt
178 final String id = reader.getAttributeValue(null, "id");
179 if (graphRootID.length() == id.length() && graphRootID.toString().equals(id)) {
180 parent = phraseStructure.getPhraseStructureRoot();
181 } else {
182 int index = id.indexOf('_');
183 if (index != -1) {
184 parent = phraseStructure.addNonTerminalNode(Integer.parseInt(id.substring(index+1))-START_ID_OF_NONTERMINALS+1);
185 }
186 }
187 SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureNodeLabelSymbolTables();
188 for (String name : inputTables.keySet()) {
189 parent.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase()));
190 }
191 } else if (reader.getLocalName().equals("name")) { // name
192 // elementContent.setLength(0);
193 // collectChar = true;
194 }
195 } else if (reader.getLocalName().charAt(0) == 't') {
196 // t -> t, terminals
197 if (reader.getLocalName().length() == 1) { // t
198 SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getInputSymbolTables();
199 child = syntaxGraph.addTokenNode();
200 for (String name : inputTables.keySet()) {
201 child.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase()));
202 }
203 }
204 } else if (reader.getLocalName().charAt(0) == 's') {
205 // s -> subcorpus, secedge, s, secedgelabel
206 if (reader.getLocalName().length() == 1) { // s
207 String id = reader.getAttributeValue(null, "id");
208 boolean indexable = false;
209 int index = -1;
210 if (id != null && id.length() > 0) {
211 for (int i = 0, n = id.length(); i < n; i++) {
212 if (Character.isDigit(id.charAt(i))) {
213 if (index == -1) {
214 index = i;
215 }
216 indexable = true;
217 }
218 }
219 }
220 if (indexable) {
221 phraseStructure.setSentenceID(Integer.parseInt(id.substring(index)));
222 } else {
223 phraseStructure.setSentenceID(sentenceCount+1);
224 }
225 }
226 } else if (reader.getLocalName().charAt(0) == 'v') {
227 // v -> variable, value
228 // if (reader.getLocalName().equals("value")) {
229 // valueName.setLength(0);
230 // valueName.append(reader.getAttributeValue(null, "name"));
231 // elementContent.setLength(0);
232 // collectChar = true;
233 // }
234 } else {
235 // a -> annotation, author
236 // b -> body
237 // c -> corpus
238 // d -> date, description,
239 // f -> feature, format
240 // g -> graph
241 // h -> head, history
242 // m -> matches, match
243 if (reader.getLocalName().equals("graph")) {
244 graphRootID.setLength(0);
245 graphRootID.append(reader.getAttributeValue(null, "root"));
246 } else if (reader.getLocalName().equals("corpus")) {
247 // header.setCorpusID(reader.getAttributeValue(null, "id"));
248 // header.setCorpusID(reader.getAttributeValue(null, "version"));
249 } else if (reader.getLocalName().equals("feature")) {
250 // if (header != null) {
251 // currentFeatureName.setLength(0);
252 // currentFeatureName.append(reader.getAttributeValue(null, "name"));
253 // header.addFeature(reader.getAttributeValue(null, "name"), reader.getAttributeValue(null, "domain"));
254 // }
255 // domain = Domain.valueOf(reader.getAttributeValue(null, "domain"));
256 } else if (reader.getLocalName().equals("secedgelabel")) {
257 // domain = Domain.SEL;
258 } else if (reader.getLocalName().equals("author")) {
259 // elementContent.setLength(0);
260 // collectChar = true;
261 } else if (reader.getLocalName().equals("date")) {
262 // elementContent.setLength(0);
263 // collectChar = true;
264 } else if (reader.getLocalName().equals("description")) {
265 // elementContent.setLength(0);
266 // collectChar = true;
267 } else if (reader.getLocalName().equals("format")) {
268 // elementContent.setLength(0);
269 // collectChar = true;
270 } else if (reader.getLocalName().equals("history")) {
271 // elementContent.setLength(0);
272 // collectChar = true;
273 }
274 }
275 } else if (event == XMLStreamConstants.END_ELEMENT) {
276 if (reader.getLocalName().length() == 0) {
277 continue;
278 }
279 if (reader.getLocalName().charAt(0) == 'e') {
280 // e -> edge, edgelabel
281 } else if (reader.getLocalName().charAt(0) == 'n') {
282 // n -> nt, nonterminals, name
283 if (reader.getLocalName().equals("nt")) {
284 ntid.setLength(0);
285 }
286 else if (reader.getLocalName().equals("nonterminals")) {
287 if (phraseStructure.nTokenNode() == 1 && phraseStructure.nNonTerminals() == 0 &&((NonTerminalNode)phraseStructure.getPhraseStructureRoot()).nChildren() == 0) {
288 Edge e = phraseStructure.addPhraseStructureEdge(phraseStructure.getPhraseStructureRoot(), phraseStructure.getTokenNode(1));
289 SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables();
290 for (String name : inputTables.keySet()) {
291 e.addLabel(inputTables.get(name), "--");
292 }
293 }
294 }
295 // else if (reader.getLocalName().equals("name")) {
296 // if (header != null) {
297 // header.setMetaName(elementContent.toString());
298 // }
299 // collectChar = false;
300 // }
301 } else if (reader.getLocalName().charAt(0) == 't') {
302 // t -> t, terminals
303 } else if (reader.getLocalName().charAt(0) == 's') {
304 // s -> subcorpus, secedge, s, secedgelabel
305 if (reader.getLocalName().equals("s")) {
306 if (syntaxGraph.hasTokens()) {
307 sentenceCount++;
308 }
309 if (syntaxGraph instanceof MappablePhraseStructureGraph) {
310 ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
311 }
312 return true;
313 }
314 } else if (reader.getLocalName().charAt(0) == 'v') {
315 // v -> variable, value
316 // if (reader.getLocalName().equals("value")) {
317 // if (header != null) {
318 // if (domain == Domain.T || domain == Domain.NT || domain == Domain.FREC) {
319 // header.addFeatureValue(currentFeatureName.toString(), valueName.toString(), elementContent.toString());
320 // } else if (domain == Domain.EL) {
321 // header.addEdgeLabelValue(valueName.toString(), elementContent.toString());
322 // } else if (domain == Domain.SEL) {
323 // header.addSecEdgeLabelValue(valueName.toString(), elementContent.toString());
324 // }
325 // }
326 // collectChar = false;
327 // }
328 } else {
329 // a -> annotation, author
330 // b -> body
331 // c -> corpus
332 // d -> date, description,
333 // f -> feature, format
334 // g -> graph
335 // h -> head, history
336 // m -> matches, match
337 if (reader.getLocalName().equals("body")) {
338 //sentence = dataStructures.getSentence();
339 //phraseTree = dataStructures.getInPhraseTree();
340 //sentence.clear();
341 //phraseTree.clear();
342 //dataStructures.setLastProcessObject(true);
343 } else if (reader.getLocalName().equals("author")) {
344 // if (header != null) {
345 // header.setMetaAuthor(elementContent.toString());
346 // }
347 // collectChar = false;
348 } else if (reader.getLocalName().equals("date")) {
349 // if (header != null) {
350 // header.setMetaInDate(elementContent.toString());
351 // }
352 // collectChar = false;
353 } else if (reader.getLocalName().equals("description")) {
354 // if (header != null) {
355 // header.setMetaDescription(elementContent.toString());
356 // }
357 // collectChar = false;
358 } else if (reader.getLocalName().equals("format")) {
359 // if (header != null) {
360 // header.setMetaFormat(elementContent.toString());
361 // }
362 // collectChar = false;
363 } else if (reader.getLocalName().equals("history")) {
364 // if (header != null) {
365 // header.setMetaHistory(elementContent.toString());
366 // }
367 // collectChar = false;
368 } /* else if (reader.getLocalName().equals("annotation")) {
369 if (header != null) {
370 System.out.println(header.toTigerXML());
371 }
372 collectChar = false;
373 } */
374 }
375 } else if (event == XMLStreamConstants.END_DOCUMENT) {
376 if (syntaxGraph.hasTokens()) {
377 sentenceCount++;
378 }
379 if (cIterations < nIterations) {
380 cIterations++;
381 reopen();
382 return true;
383 }
384 return false;
385 } else if (event == XMLStreamConstants.CHARACTERS) {
386 // if (collectChar) {
387 // char[] ch = reader.getTextCharacters();
388 // final int size = reader.getTextStart()+reader.getTextLength();
389 // for (int i = reader.getTextStart(); i < size; i++) {
390 // elementContent.append(ch[i]);
391 // }
392 // }
393 }
394 }
395 } catch (XMLStreamException e) {
396 throw new DataFormatException("", e);
397 }
398 }
399
400 public int getSentenceCount() {
401 return sentenceCount;
402 }
403
404 public void setSentenceCount(int sentenceCount) {
405 this.sentenceCount = sentenceCount;
406 }
407
408 public XMLStreamReader getReader() {
409 return reader;
410 }
411
412 public void setReader(XMLStreamReader reader) {
413 this.reader = reader;
414 }
415
416 public void readEpilog() throws MaltChainedException {
417
418 }
419
420 public void close() throws MaltChainedException {
421 try {
422 if (reader != null) {
423 if (closeStream) {
424 reader.close();
425 }
426 reader = null;
427 }
428 } catch (XMLStreamException e) {
429 throw new DataFormatException("The XML input file could be closed. ", e);
430 }
431 }
432
433 public DataFormatInstance getDataFormatInstance() {
434 return dataFormatInstance;
435 }
436
437 public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
438 this.dataFormatInstance = inputDataFormatInstance;
439 }
440
441 public String getOptions() {
442 return optionString;
443 }
444
445 public void setOptions(String optionString) throws MaltChainedException {
446 this.optionString = optionString;
447 String[] argv;
448 try {
449 argv = optionString.split("[_\\p{Blank}]");
450 } catch (PatternSyntaxException e) {
451 throw new DataFormatException("Could not split the TigerXML reader option '"+optionString+"'. ", e);
452 }
453 for (int i=0; i < argv.length-1; i++) {
454 if(argv[i].charAt(0) != '-') {
455 throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
456 }
457 if(++i>=argv.length) {
458 throw new DataFormatException("The last argument does not have any value. ");
459 }
460 switch(argv[i-1].charAt(1)) {
461 case 's':
462 try {
463 START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
464 } catch (NumberFormatException e){
465 throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. ");
466 }
467 break;
468 default:
469 throw new DataFormatException("Unknown TigerXMLReader parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");
470 }
471 }
472 }
473
474 public String getFileName() {
475 return fileName;
476 }
477
478 public void setFileName(String fileName) {
479 this.fileName = fileName;
480 }
481
482 public URL getUrl() {
483 return url;
484 }
485
486 public void setUrl(URL url) {
487 this.url = url;
488 }
489
490 public String getCharsetName() {
491 return charsetName;
492 }
493
494 public void setCharsetName(String charsetName) {
495 this.charsetName = charsetName;
496 }
497
498 public int getNIterations() {
499 return nIterations;
500 }
501
502 public void setNIterations(int iterations) {
503 nIterations = iterations;
504 }
505
506 public int getIterationCounter() {
507 return cIterations;
508 }
509 // public TigerXMLHeader getHeader() {
510 // return header;
511 // }
512 //
513 // public void setHeader(TigerXMLHeader header) {
514 // this.header = header;
515 // }
516 }