use of gate.TextualDocument in project gate-core by GateNLP.
the class XmlDocumentFormat method unpackGateFormatMarkup.
/**
 * Unpacks markup in the GATE-specific standoff XML markup format.
 *
 * <p>The input is taken from the document content when the document has
 * content but no valid URL; otherwise it is read from the document's
 * source URL. Every stream, reader and StAX reader opened here is closed
 * before the method returns, including when parsing fails.
 *
 * @param doc the document to process
 * @param statusListener optional status listener to receive status
 *          messages
 * @throws DocumentFormatException if an I/O error occurs, or if an XML
 *           parsing error occurs and the document feature
 *           {@code GateConstants.THROWEX_FORMAT_PROPERTY_NAME} is
 *           {@code Boolean.TRUE}
 */
private void unpackGateFormatMarkup(Document doc, StatusListener statusListener) throws DocumentFormatException {
  boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
  try {
    Reader inputReader = null;
    InputStream inputStream = null;
    XMLStreamReader xsr = null;
    // Everything that can fail after a resource has been opened lives inside
    // this try, so the finally block below always releases it.
    try {
      if (docHasContentButNoValidURL) {
        inputReader = new StringReader(doc.getContent().toString());
        xsr = getInputFactory().createXMLStreamReader(inputReader);
      } else if (doc instanceof TextualDocument) {
        String encoding = ((TextualDocument) doc).getEncoding();
        // Keep a handle on the raw stream: if the encoding is rejected by
        // InputStreamReader the stream must still be closed.
        inputStream = doc.getSourceUrl().openStream();
        // Don't strip BOM on XML.
        inputReader = new InputStreamReader(inputStream, encoding);
        // create stream reader with the URL as system ID, to support
        // relative URLs to e.g. DTD or external entities
        xsr = getInputFactory().createXMLStreamReader(doc.getSourceUrl().toExternalForm(), inputReader);
      } else {
        // not a TextualDocument, so let parser determine encoding
        inputStream = doc.getSourceUrl().openStream();
        xsr = getInputFactory().createXMLStreamReader(doc.getSourceUrl().toExternalForm(), inputStream);
      }
      // find the opening GateDocument tag
      xsr.nextTag();
      // parse the document
      DocumentStaxUtils.readGateXmlDocument(xsr, doc, statusListener);
    } finally {
      // Nested finally blocks so that a failure while closing one resource
      // does not prevent the remaining ones from being closed.
      try {
        if (xsr != null) {
          xsr.close();
        }
      } finally {
        try {
          if (inputReader != null) {
            inputReader.close();
          }
        } finally {
          if (inputStream != null) {
            inputStream.close();
          }
        }
      }
    }
  } catch (XMLStreamException e) {
    // Flag the failure on the document, then either escalate or just warn,
    // depending on the THROWEX_FORMAT_PROPERTY_NAME document feature.
    doc.getFeatures().put("parsingError", Boolean.TRUE);
    Boolean bThrow = (Boolean) doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
    if (bThrow != null && bThrow.booleanValue()) {
      throw new DocumentFormatException(e);
    } else {
      Out.println("Warning: Document remains unparsed. \n" + "\n Stack Dump: ");
      e.printStackTrace(Out.getPrintWriter());
    }
  } catch (IOException ioe) {
    // String concatenation tolerates a null source URL, unlike toString().
    throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), ioe);
  }
}
use of gate.TextualDocument in project gate-core by GateNLP.
the class DocumentStaxUtils method writeDocument.
/**
 * Serialises a document as XML onto the given output stream.
 *
 * <p>For a {@link TextualDocument} the document's own encoding is used both
 * for the stream writer and for the XML declaration; for any other document
 * the writer and declaration are created without an explicit encoding. The
 * body is produced by the
 * {@code writeDocument(Document, XMLStreamWriter, String)} overload.
 * The XML stream writer created here is always closed before returning.
 *
 * @param doc the document to write
 * @param outputStream the stream that receives the XML
 * @param namespaceURI passed through unchanged to the
 *          {@code XMLStreamWriter}-based overload
 * @throws XMLStreamException if the StAX writer reports an error
 * @throws IOException declared for callers; not thrown directly by the
 *           statements in this method
 */
public static void writeDocument(Document doc, OutputStream outputStream, String namespaceURI) throws XMLStreamException, IOException {
  // lazily create the shared output factory on first use
  if (outputFactory == null) {
    outputFactory = XMLOutputFactory.newInstance();
  }
  XMLStreamWriter writer = null;
  try {
    if (!(doc instanceof TextualDocument)) {
      // no encoding information available on the document
      writer = outputFactory.createXMLStreamWriter(outputStream);
      writer.writeStartDocument("1.0");
    } else {
      // honour the encoding carried by the textual document
      String encoding = ((TextualDocument) doc).getEncoding();
      writer = outputFactory.createXMLStreamWriter(outputStream, encoding);
      writer.writeStartDocument(encoding, "1.0");
    }
    newLine(writer);
    writeDocument(doc, writer, namespaceURI);
  } finally {
    if (writer != null) {
      writer.close();
    }
  }
}
use of gate.TextualDocument in project gate-core by GateNLP.
the class DocumentStaxUtils method toXml.
/**
 * Returns a string containing the specified document in GATE XML
 * format.
 *
 * <p>The output buffer is pre-sized from the document content length
 * multiplied by {@code DocumentXmlUtils.DOC_SIZE_MULTIPLICATION_FACTOR}.
 * For a {@link TextualDocument} the document's encoding is written into the
 * XML declaration.
 *
 * @param doc the document
 * @return the XML serialisation of the document
 * @throws GateRuntimeException if the StAX writer reports an error
 */
public static String toXml(Document doc) {
  try {
    if (outputFactory == null) {
      outputFactory = XMLOutputFactory.newInstance();
    }
    // Compute the size estimate in long arithmetic: the int product can
    // overflow to a negative value for large documents, which would make
    // the StringWriter constructor throw IllegalArgumentException.
    int contentSize = doc.getContent().size().intValue();
    long estimatedSize = (long) contentSize * DocumentXmlUtils.DOC_SIZE_MULTIPLICATION_FACTOR;
    int initialCapacity;
    if (estimatedSize >= 0 && estimatedSize <= Integer.MAX_VALUE) {
      initialCapacity = (int) estimatedSize;
    } else {
      // The estimate is only a hint; fall back to the raw content size and
      // let the buffer grow on demand.
      initialCapacity = Math.max(contentSize, 0);
    }
    StringWriter sw = new StringWriter(initialCapacity);
    XMLStreamWriter xsw = outputFactory.createXMLStreamWriter(sw);
    try {
      // start the document
      if (doc instanceof TextualDocument) {
        xsw.writeStartDocument(((TextualDocument) doc).getEncoding(), "1.0");
      } else {
        xsw.writeStartDocument("1.0");
      }
      newLine(xsw);
      writeDocument(doc, xsw, "");
    } finally {
      // Close on every path; this must happen before sw.toString() so that
      // anything the writer still buffers reaches the StringWriter.
      xsw.close();
    }
    return sw.toString();
  } catch (XMLStreamException xse) {
    throw new GateRuntimeException("Error converting document to XML", xse);
  }
}
use of gate.TextualDocument in project gate-core by GateNLP.
the class NekoHtmlDocumentFormat method unpackMarkup.
/**
 * Unpack the markup in the document. This converts markup from the
 * native format into annotations in GATE format. If the document was
 * created from a String, it is recommended to set the document's
 * sourceUrl to <b>null</b>. If the document has a valid URL, the
 * parser reads the markup from that URL; if the URL is not valid, or
 * is null, the document's content is parsed instead.
 *
 * <p>Parser failures other than I/O errors set the
 * <code>parsingError</code> document feature. They are rethrown as
 * {@link DocumentFormatException} only when the document feature
 * <code>GateConstants.THROWEX_FORMAT_PROPERTY_NAME</code> is
 * <code>Boolean.TRUE</code>; otherwise a warning and stack trace are
 * printed and the document is left unparsed.
 *
 * @param doc The GATE document you want to parse. If it has content
 *          but no valid URL then the content of doc will be parsed.
 *          Using a URL is recommended because the parser will report
 *          errors correctly if the document is not well formed.
 * @param repInfo repositioning information object handed to the
 *          document handler
 * @param ampCodingInfo object with ampersand coding positions handed
 *          to the document handler
 * @throws DocumentFormatException if <code>doc</code> is null, has
 *           neither a source URL nor content, an I/O error occurs, or
 *           a parsing error occurs and throwing was requested
 */
@Override
public void unpackMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo) throws DocumentFormatException {
  if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
    throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
  }
  // Create a status listener that forwards handler messages to our own listeners
  StatusListener statusListener = new StatusListener() {
    @Override
    public void statusChanged(String text) {
      // This is implemented in DocumentFormat.java and inherited here
      fireStatusChanged(text);
    }
  };
  boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
  NekoHtmlDocumentHandler handler = null;
  try {
    org.cyberneko.html.HTMLConfiguration parser = new HTMLConfiguration();
    // convert element and attribute names to lower case
    parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
    parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
    // make parser augment infoset with location information
    parser.setFeature(NekoHtmlDocumentHandler.AUGMENTATIONS, true);
    // Create a new document handler
    handler = new NekoHtmlDocumentHandler(doc, null, ignorableTags);
    // Register a status listener with it
    handler.addStatusListener(statusListener);
    // set repositioning object
    handler.setRepositioningInfo(repInfo);
    // set the object with ampersand coding positions
    handler.setAmpCodingInfo(ampCodingInfo);
    // construct the list of offsets for each line of the document
    int[] lineOffsets = buildLineOffsets(doc.getContent().toString());
    handler.setLineOffsets(lineOffsets);
    // set the handlers
    parser.setDocumentHandler(handler);
    parser.setErrorHandler(handler);
    // Resources opened for the URL-based branch; released in the finally below.
    InputStream uStream = null;
    Reader docReader = null;
    try {
      // Build the input source with the appropriate encoding
      XMLInputSource is;
      if (docHasContentButNoValidURL) {
        // no URL, so parse from string
        is = new XMLInputSource(null, null, null, new StringReader(doc.getContent().toString()), null);
      } else if (doc instanceof TextualDocument) {
        // textual document - load with user specified encoding
        String docEncoding = ((TextualDocument) doc).getEncoding();
        // no BOM stripping.
        URLConnection conn = doc.getSourceUrl().openConnection();
        // Record the raw stream first so it is closed even if wrapping it
        // in a GZIPInputStream or InputStreamReader fails.
        uStream = conn.getInputStream();
        if ("gzip".equals(conn.getContentEncoding())) {
          uStream = new GZIPInputStream(uStream);
        }
        docReader = new InputStreamReader(uStream, docEncoding);
        is = new XMLInputSource(null, doc.getSourceUrl().toString(), doc.getSourceUrl().toString(), docReader, docEncoding);
        // since we control the encoding, tell the parser to ignore any
        // meta http-equiv hints
        parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true);
      } else {
        // let the parser decide the encoding
        is = new XMLInputSource(null, doc.getSourceUrl().toString(), doc.getSourceUrl().toString());
      }
      /* The following line can forward an
       * ArrayIndexOutOfBoundsException from
       * org.cyberneko.html.HTMLConfiguration.parse; it is handled by the
       * generic catch block below. */
      parser.parse(is);
    } finally {
      // make sure the streams opened above are closed, whatever happened
      try {
        if (docReader != null) {
          docReader.close();
        }
      } finally {
        if (uStream != null) {
          uStream.close();
        }
      }
    }
    ((DocumentImpl) doc).setNextAnnotationId(handler.getCustomObjectsId());
  } catch (IOException e) {
    /* Handle IOException specially. String concatenation tolerates a
     * null source URL, unlike toString(). */
    throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), e);
  } catch (Exception e) {
    /* Handle XNIException and ArrayIndexOutOfBoundsException:
     * flag the parsing error and keep going unless throwing was requested. */
    doc.getFeatures().put("parsingError", Boolean.TRUE);
    Boolean bThrow = (Boolean) doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
    if (bThrow != null && bThrow.booleanValue()) {
      throw new DocumentFormatException(e);
    } else {
      Out.println("Warning: Document remains unparsed. \n" + "\n Stack Dump: ");
      e.printStackTrace(Out.getPrintWriter());
    }
  } finally {
    if (handler != null) {
      handler.removeStatusListener(statusListener);
    }
  }
}
use of gate.TextualDocument in project gate-core by GateNLP.
the class XmlDocumentFormat method unpackGeneralXmlMarkup.
/**
 * Unpack markup from any XML format. The XML elements are translated
 * to annotations on the Original markups annotation set.
 *
 * <p>The XML is taken from the document content when the document has
 * content but no valid URL; otherwise it is read from the document's
 * source URL, using the document's own encoding for a
 * {@link TextualDocument} and leaving encoding detection to the parser
 * in all other cases.
 *
 * @param doc the document to process
 * @param repInfo repositioning information object handed to the
 *          document handler
 * @param ampCodingInfo object with ampersand coding positions handed
 *          to the document handler
 * @param statusListener listener registered on the document handler for
 *          the duration of the parse
 * @throws DocumentFormatException if the parser cannot be configured,
 *           an I/O error occurs, or a SAX parsing error occurs and the
 *           document feature
 *           {@code GateConstants.THROWEX_FORMAT_PROPERTY_NAME} is
 *           {@code Boolean.TRUE}
 */
private void unpackGeneralXmlMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo, StatusListener statusListener) throws DocumentFormatException {
  boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
  XmlDocumentHandler xmlDocHandler = null;
  try {
    // Get a parser factory.
    SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
    // Set up the factory to create a non-validating ...
    saxParserFactory.setValidating(false);
    // ... namespace-aware parser
    saxParserFactory.setNamespaceAware(true);
    // create it
    SAXParser xmlParser = saxParserFactory.newSAXParser();
    // Create a new Xml document handler
    xmlDocHandler = new XmlDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);
    // Register a status listener with it
    xmlDocHandler.addStatusListener(statusListener);
    // set repositioning object
    xmlDocHandler.setRepositioningInfo(repInfo);
    // set the object with ampersand coding positions
    xmlDocHandler.setAmpCodingInfo(ampCodingInfo);
    org.xml.sax.XMLReader newxmlParser = xmlParser.getXMLReader();
    // non-validating: http://xml.org/sax/features/validation set to false
    newxmlParser.setFeature("http://xml.org/sax/features/validation", false);
    // namespace-aware: http://xml.org/sax/features/namespaces set to true
    newxmlParser.setFeature("http://xml.org/sax/features/namespaces", true);
    newxmlParser.setFeature("http://xml.org/sax/features/namespace-prefixes", true);
    newxmlParser.setContentHandler(xmlDocHandler);
    newxmlParser.setErrorHandler(xmlDocHandler);
    newxmlParser.setDTDHandler(xmlDocHandler);
    // NOTE(review): external entity handling depends entirely on what
    // XmlDocumentHandler.resolveEntity does - confirm it is safe for
    // untrusted input.
    newxmlParser.setEntityResolver(xmlDocHandler);
    // Parse the XML Document with the appropriate encoding
    InputStream docStream = null;
    Reader docReader = null;
    try {
      InputSource is;
      if (docHasContentButNoValidURL) {
        // no URL, so parse from string
        is = new InputSource(new StringReader(doc.getContent().toString()));
      } else if (doc instanceof TextualDocument) {
        // textual document - load with user specified encoding
        String docEncoding = ((TextualDocument) doc).getEncoding();
        // Keep a handle on the raw stream: if the encoding is rejected by
        // InputStreamReader the stream must still be closed.
        docStream = doc.getSourceUrl().openStream();
        // don't strip BOM on XML.
        docReader = new InputStreamReader(docStream, docEncoding);
        is = new InputSource(docReader);
        // must set system ID to allow relative URLs (e.g. to a DTD) to
        // work
        is.setSystemId(doc.getSourceUrl().toString());
      } else {
        // let the parser decide the encoding
        is = new InputSource(doc.getSourceUrl().toString());
      }
      newxmlParser.parse(is);
    } finally {
      // make sure the open streams are closed
      try {
        if (docReader != null) {
          docReader.close();
        }
      } finally {
        if (docStream != null) {
          docStream.close();
        }
      }
    }
    ((DocumentImpl) doc).setNextAnnotationId(xmlDocHandler.getCustomObjectsId());
  } catch (ParserConfigurationException e) {
    throw new DocumentFormatException("XML parser configuration exception ", e);
  } catch (SAXException e) {
    // Flag the failure on the document, then either escalate or just warn,
    // depending on the THROWEX_FORMAT_PROPERTY_NAME document feature.
    doc.getFeatures().put("parsingError", Boolean.TRUE);
    Boolean bThrow = (Boolean) doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
    if (bThrow != null && bThrow.booleanValue()) {
      throw new DocumentFormatException(e);
    } else {
      Out.println("Warning: Document remains unparsed. \n" + "\n Stack Dump: ");
      e.printStackTrace(Out.getPrintWriter());
    }
  } catch (IOException e) {
    throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), e);
  } finally {
    if (xmlDocHandler != null) {
      xmlDocHandler.removeStatusListener(statusListener);
    }
  }
}
Aggregations