use of gate.util.DocumentFormatException in project gate-core by GateNLP.
the class XmlDocumentFormat method unpackMarkup.
// unpackMarkup
/**
 * Converts the native XML markup of a document into GATE annotations.
 * <p>
 * The first 2048 characters of the document content are inspected to
 * decide whether the document is stored in GATE's own XML format; the
 * work is then handed to the GATE-format unpacker or to the general XML
 * unpacker accordingly. Status messages produced while unpacking are
 * forwarded to this format's own status listeners.
 * <p>
 * If the document was created from a String it is advisable to set its
 * source URL to <b>null</b> so that the content, rather than a URL, is
 * what gets parsed.
 *
 * @param doc the GATE document to parse; must not be null and must have
 *          either a source URL or content
 * @param repInfo repositioning information, passed through to the
 *          general XML unpacker
 * @param ampCodingInfo ampersand-coding repositioning information,
 *          passed through to the general XML unpacker
 * @throws DocumentFormatException if the document is null, or has
 *           neither a source URL nor content, or if unpacking fails
 */
@Override
public void unpackMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo) throws DocumentFormatException {
  final boolean nothingToParse = doc == null || (doc.getSourceUrl() == null && doc.getContent() == null);
  if (nothingToParse) {
    throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
  }

  // Relay status messages to the listeners registered on this format
  // (fireStatusChanged is inherited from DocumentFormat).
  final StatusListener relay = new StatusListener() {
    @Override
    public void statusChanged(String text) {
      fireStatusChanged(text);
    }
  };

  // Only the head of the document is needed to recognise the format.
  // NOTE(review): the guard above admits a document that has a source URL
  // but null content; the next line would then throw a
  // NullPointerException - confirm callers always populate the content.
  final String fullText = doc.getContent().toString();
  final String head = fullText.length() > 2048 ? fullText.substring(0, 2048) : fullText;

  if (isGateXmlFormat(head)) {
    unpackGateFormatMarkup(doc, relay);
    return;
  }
  unpackGeneralXmlMarkup(doc, repInfo, ampCodingInfo, relay);
}
use of gate.util.DocumentFormatException in project gate-core by GateNLP.
the class XmlDocumentFormat method unpackGateFormatMarkup.
/**
 * Unpacks markup in the GATE-specific standoff XML markup format.
 * <p>
 * The XML is read from the document's own content when
 * {@code hasContentButNoValidUrl(doc)} is true, and from the document's
 * source URL otherwise. For a {@link TextualDocument} the URL stream is
 * decoded with the document's declared encoding; for any other document
 * the XML parser is left to determine the encoding itself.
 * <p>
 * Every stream or reader opened here is closed before the method
 * returns, whether parsing succeeds or fails.
 *
 * @param doc the document to process
 * @param statusListener optional status listener to receive status
 *          messages
 * @throws DocumentFormatException if an I/O error occurs, or if an XML
 *           parsing error occurs and the document feature
 *           {@code GateConstants.THROWEX_FORMAT_PROPERTY_NAME} is set to
 *           {@code Boolean.TRUE}; otherwise a parsing error is only
 *           reported on {@code Out} and flagged through the
 *           "parsingError" document feature
 */
private void unpackGateFormatMarkup(Document doc, StatusListener statusListener) throws DocumentFormatException {
  boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
  try {
    Reader inputReader = null;
    InputStream inputStream = null;
    XMLStreamReader xsr = null;
    // One try/finally spans opening, positioning and parsing, so that a
    // failure at any of these stages still releases what was opened.
    try {
      if (docHasContentButNoValidURL) {
        inputReader = new StringReader(doc.getContent().toString());
        xsr = getInputFactory().createXMLStreamReader(inputReader);
      } else if (doc instanceof TextualDocument) {
        String encoding = ((TextualDocument) doc).getEncoding();
        // Don't strip BOM on XML.
        // Keep a handle on the raw stream: if the encoding is unsupported
        // the InputStreamReader constructor throws and the stream would
        // otherwise be unreachable and never closed.
        inputStream = doc.getSourceUrl().openStream();
        inputReader = new InputStreamReader(inputStream, encoding);
        // create stream reader with the URL as system ID, to support
        // relative URLs to e.g. DTD or external entities
        xsr = getInputFactory().createXMLStreamReader(doc.getSourceUrl().toExternalForm(), inputReader);
      } else {
        // not a TextualDocument, so let parser determine encoding
        inputStream = doc.getSourceUrl().openStream();
        xsr = getInputFactory().createXMLStreamReader(doc.getSourceUrl().toExternalForm(), inputStream);
      }
      // find the opening GateDocument tag
      xsr.nextTag();
      // parse the document
      DocumentStaxUtils.readGateXmlDocument(xsr, doc, statusListener);
    } finally {
      // XMLStreamReader.close() does not close the underlying source, so
      // the reader/stream are closed explicitly. The nesting guarantees
      // each close is attempted even if an earlier one throws.
      try {
        if (xsr != null) {
          xsr.close();
        }
      } finally {
        try {
          if (inputReader != null) {
            inputReader.close();
          }
        } finally {
          if (inputStream != null) {
            inputStream.close();
          }
        }
      }
    }
  } catch (XMLStreamException e) {
    doc.getFeatures().put("parsingError", Boolean.TRUE);
    Boolean bThrow = (Boolean) doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
    if (bThrow != null && bThrow.booleanValue()) {
      // the caller asked for parse errors to be propagated
      throw new DocumentFormatException(e);
    } else {
      Out.println("Warning: Document remains unparsed. \n" + "\n Stack Dump: ");
      e.printStackTrace(Out.getPrintWriter());
    }
  } catch (IOException ioe) {
    // String concatenation is null-safe; the source URL may be null when
    // the document was parsed from its content.
    throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), ioe);
  }
}
use of gate.util.DocumentFormatException in project gate-core by GateNLP.
the class TikaFormat method unpackMarkup.
/**
 * Unpacks a document by running the Tika parser over the stream opened
 * from the document's source URL, feeding the resulting SAX events to an
 * {@code XmlDocumentHandler} that operates on the document.
 *
 * @param doc the GATE document to parse; must not be null and must have
 *          a source URL
 * @param repInfo repositioning information handed to the handler
 * @param ampCodingInfo ampersand-coding positions handed to the handler
 * @throws DocumentFormatException if the document is null or has no
 *           source URL, or if Tika reports an I/O, SAX or Tika error
 */
@Override
public void unpackMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo) throws DocumentFormatException {
  final boolean missingSource = doc == null || doc.getSourceUrl() == null;
  if (missingSource) {
    throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
  }

  // Relay handler status messages to the listeners registered on this
  // format (fireStatusChanged is inherited from DocumentFormat).
  final StatusListener relay = new StatusListener() {
    @Override
    public void statusChanged(String text) {
      fireStatusChanged(text);
    }
  };

  final XmlDocumentHandler contentHandler = new XmlDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);
  final Metadata parserTips = extractParserTips(doc);
  contentHandler.addStatusListener(relay);
  contentHandler.setRepositioningInfo(repInfo);
  // positions of ampersand-coded entities
  contentHandler.setAmpCodingInfo(ampCodingInfo);

  InputStream sourceStream = null;
  try {
    final Parser tika = new TikaConfig().getParser();
    sourceStream = doc.getSourceUrl().openStream();
    tika.parse(sourceStream, contentHandler, parserTips, new ParseContext());
    setDocumentFeatures(parserTips, doc);
  } catch (TikaException e) {
    throw new DocumentFormatException(e);
  } catch (SAXException e) {
    throw new DocumentFormatException(e);
  } catch (IOException e) {
    throw new DocumentFormatException(e);
  } finally {
    // closeQuietly tolerates a stream that was never opened
    IOUtils.closeQuietly(sourceStream);
    contentHandler.removeStatusListener(relay);
  }

  // Only DocumentImpl exposes the next-annotation-id setter.
  if (!(doc instanceof DocumentImpl)) {
    return;
  }
  ((DocumentImpl) doc).setNextAnnotationId(contentHandler.getCustomObjectsId());
}
use of gate.util.DocumentFormatException in project gate-core by GateNLP.
the class NekoHtmlDocumentFormat method unpackMarkup.
/**
 * Unpack the markup in the document. This converts markup from the
 * native HTML format into annotations in GATE format, using the NekoHTML
 * parser.
 * <p>
 * The input is taken from the document's own content when
 * {@code hasContentButNoValidUrl(doc)} is true, and from the document's
 * source URL otherwise. If the document was created from a String it is
 * therefore advisable to set its source URL to <b>null</b>. For a
 * {@link TextualDocument} the URL stream is decoded with the document's
 * declared encoding (gzip content encoding is unwrapped) and any charset
 * declared inside the HTML is ignored; for other documents the parser
 * determines the encoding itself.
 * <p>
 * Failures other than I/O errors are treated as parsing errors: the
 * "parsingError" document feature is set and, unless the document feature
 * {@code GateConstants.THROWEX_FORMAT_PROPERTY_NAME} is
 * {@code Boolean.TRUE}, the error is only reported on {@code Out}.
 *
 * @param doc the GATE document to parse; must not be null and must have
 *          either a source URL or content
 * @param repInfo repositioning information handed to the handler
 * @param ampCodingInfo ampersand-coding positions handed to the handler
 * @throws DocumentFormatException if the document is null or has neither
 *           source URL nor content, on I/O errors, or on parsing errors
 *           when the document requests that they be thrown
 */
@Override
public void unpackMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo) throws DocumentFormatException {
  if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
    throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
  }
  // Create a status listener
  StatusListener statusListener = new StatusListener() {
    @Override
    public void statusChanged(String text) {
      // This is implemented in DocumentFormat.java and inherited here
      fireStatusChanged(text);
    }
  };
  boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
  NekoHtmlDocumentHandler handler = null;
  // Declared outside the try so that whatever was opened for the URL can
  // be released in the finally block.
  InputStream uStream = null;
  Reader docReader = null;
  try {
    org.cyberneko.html.HTMLConfiguration parser = new HTMLConfiguration();
    // convert element and attribute names to lower case
    parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
    parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
    // make parser augment infoset with location information
    parser.setFeature(NekoHtmlDocumentHandler.AUGMENTATIONS, true);
    // Create a new document handler
    handler = new NekoHtmlDocumentHandler(doc, null, ignorableTags);
    // Register a status listener with it
    handler.addStatusListener(statusListener);
    // set repositioning object
    handler.setRepositioningInfo(repInfo);
    // set the object with ampersand coding positions
    handler.setAmpCodingInfo(ampCodingInfo);
    // construct the list of offsets for each line of the document
    int[] lineOffsets = buildLineOffsets(doc.getContent().toString());
    handler.setLineOffsets(lineOffsets);
    // set the handlers
    parser.setDocumentHandler(handler);
    parser.setErrorHandler(handler);
    // Parse the document with the appropriate encoding
    XMLInputSource is;
    if (docHasContentButNoValidURL) {
      // no URL, so parse from string
      is = new XMLInputSource(null, null, null, new StringReader(doc.getContent().toString()), null);
    } else if (doc instanceof TextualDocument) {
      // textual document - load with user specified encoding
      String docEncoding = ((TextualDocument) doc).getEncoding();
      // no BOM stripping.
      URLConnection conn = doc.getSourceUrl().openConnection();
      uStream = conn.getInputStream();
      if ("gzip".equals(conn.getContentEncoding())) {
        uStream = new GZIPInputStream(uStream);
      }
      docReader = new InputStreamReader(uStream, docEncoding);
      is = new XMLInputSource(null, doc.getSourceUrl().toString(), doc.getSourceUrl().toString(), docReader, docEncoding);
      // since we control the encoding, tell the parser to ignore any
      // meta http-equiv hints
      parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true);
    } else {
      // let the parser decide the encoding
      is = new XMLInputSource(null, doc.getSourceUrl().toString(), doc.getSourceUrl().toString());
    }
    /* The following line can forward an
     * ArrayIndexOutOfBoundsException from
     * org.cyberneko.html.HTMLConfiguration.parse and crash GATE. */
    parser.parse(is);
    // Only DocumentImpl exposes the next-annotation-id setter. Without
    // this guard a ClassCastException would be caught below and a
    // successfully parsed document would be flagged as a parsing error.
    if (doc instanceof DocumentImpl) {
      ((DocumentImpl) doc).setNextAnnotationId(handler.getCustomObjectsId());
    }
  } catch (IOException e) {
    /* Handle IOException specially. String concatenation is null-safe;
     * the source URL may be null when parsing from the content. */
    throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), e);
  } catch (Exception e) {
    /* Handle XNIException and ArrayIndexOutOfBoundsException:
     * flag the parsing error and keep going. */
    doc.getFeatures().put("parsingError", Boolean.TRUE);
    Boolean bThrow = (Boolean) doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
    if (bThrow != null && bThrow.booleanValue()) {
      // the caller asked for parse errors to be propagated
      throw new DocumentFormatException(e);
    } else {
      Out.println("Warning: Document remains unparsed. \n" + "\n Stack Dump: ");
      e.printStackTrace(Out.getPrintWriter());
    }
  } finally {
    if (handler != null) {
      handler.removeStatusListener(statusListener);
    }
    // Best-effort release of the URL stream; a failure to close must not
    // mask the outcome of the parse.
    if (docReader != null) {
      try {
        docReader.close();
      } catch (IOException ignored) {
        // nothing useful can be done here
      }
    }
    if (uStream != null) {
      try {
        uStream.close();
      } catch (IOException ignored) {
        // nothing useful can be done here
      }
    }
  }
}
use of gate.util.DocumentFormatException in project gate-core by GateNLP.
the class SgmlDocumentFormat method unpackMarkup.
/**
 * Unpack the markup in the document. This converts markup from the
 * native format (e.g. SGML) into annotations in GATE format.
 * Uses the markupElementsMap to determine which elements to convert, and
 * what annotation type names to use.
 * The document is first converted to XML by {@code Sgml2Xml}; the URI
 * returned by that conversion is then parsed as an XML document with a
 * non-validating, namespace-aware SAX parser.
 *
 * @param doc The gate document you want to parse; must not be null and
 *          must have either a source URL or content.
 * @throws DocumentFormatException if the document is null or has neither
 *           source URL nor content, or if the conversion or the XML
 *           parsing fails
 */
@Override
public void unpackMarkup(Document doc) throws DocumentFormatException {
  if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
    throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
  }
  // Relay handler status messages to the listeners registered on this
  // format.
  StatusListener statusListener = new StatusListener() {
    @Override
    public void statusChanged(String text) {
      fireStatusChanged(text);
    }
  };
  XmlDocumentHandler xmlDocHandler = null;
  try {
    Sgml2Xml sgml2Xml = new Sgml2Xml(doc);
    fireStatusChanged("Performing SGML to XML...");
    // convert the SGML document
    String xmlUri = sgml2Xml.convert();
    fireStatusChanged("DONE !");
    // Get a parser factory and configure it for a non-validating,
    // namespace-aware parser.
    SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
    saxParserFactory.setValidating(false);
    saxParserFactory.setNamespaceAware(true);
    // Create a SAX parser
    SAXParser parser = saxParserFactory.newSAXParser();
    // create a new Xml document handler
    xmlDocHandler = new XmlDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);
    // register a status listener with it
    xmlDocHandler.addStatusListener(statusListener);
    parser.parse(xmlUri, xmlDocHandler);
    // Only DocumentImpl exposes the next-annotation-id setter; other
    // Document implementations must not fail with a ClassCastException
    // after a successful parse.
    if (doc instanceof DocumentImpl) {
      ((DocumentImpl) doc).setNextAnnotationId(xmlDocHandler.getCustomObjectsId());
    }
  } catch (ParserConfigurationException e) {
    throw new DocumentFormatException("XML parser configuration exception ", e);
  } catch (SAXException e) {
    throw new DocumentFormatException(e);
  } catch (IOException e) {
    // Keep the original exception as the cause. String concatenation is
    // null-safe; the source URL may be null for content-only documents.
    throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), e);
  } finally {
    if (xmlDocHandler != null) {
      xmlDocHandler.removeStatusListener(statusListener);
    }
  }
}
Aggregations