Use of gate.html.NekoHtmlDocumentHandler in project gate-core by GateNLP.
From the class NekoHtmlDocumentFormat, method unpackMarkup.
/**
 * Unpacks the markup in the document, converting the native HTML markup
 * into annotations in GATE format.
 * <p>
 * The input is chosen as follows:
 * <ul>
 * <li>if {@code hasContentButNoValidUrl(doc)} is true, the document's
 * content string is parsed;</li>
 * <li>otherwise, if the document is a {@link TextualDocument}, the source
 * URL is read using the document's own encoding (transparently
 * un-gzipping when the connection reports a {@code gzip} content
 * encoding) and the parser is told to ignore any charset specified
 * inside the page;</li>
 * <li>otherwise the parser is handed the source URL and decides the
 * encoding itself.</li>
 * </ul>
 * If the document was created from a String, it is recommended to set the
 * document's source URL to {@code null}.
 * <p>
 * Any non-I/O failure during set-up or parsing sets the
 * {@code "parsingError"} feature on the document to {@code Boolean.TRUE}.
 * It is then either rethrown (when the document feature
 * {@code GateConstants.THROWEX_FORMAT_PROPERTY_NAME} is
 * {@code Boolean.TRUE}) or reported as a warning on {@code Out}, leaving
 * the document unparsed.
 *
 * @param doc The GATE document to parse. Must not be {@code null} and
 *          must have a source URL or some content.
 * @param repInfo repositioning information, passed on to the document
 *          handler.
 * @param ampCodingInfo ampersand-coding position information, passed on
 *          to the document handler.
 * @throws DocumentFormatException if {@code doc} is {@code null} or has
 *           neither a source URL nor content, if an I/O error occurs, or
 *           if parsing fails and the document requests exceptions via
 *           {@code GateConstants.THROWEX_FORMAT_PROPERTY_NAME}.
 */
@Override
public void unpackMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo) throws DocumentFormatException {
  if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
    throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
  }

  // Forward handler status messages to this format's own listeners.
  StatusListener statusListener = new StatusListener() {
    @Override
    public void statusChanged(String text) {
      // This is implemented in DocumentFormat.java and inherited here
      fireStatusChanged(text);
    }
  };
  boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
  NekoHtmlDocumentHandler handler = null;
  // Resources opened by this method for the URL branch; released in the
  // finally block so they cannot leak when set-up or parsing fails.
  InputStream urlStream = null;
  Reader docReader = null;
  try {
    org.cyberneko.html.HTMLConfiguration parser = new HTMLConfiguration();
    // convert element and attribute names to lower case
    parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
    parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
    // make parser augment infoset with location information
    parser.setFeature(NekoHtmlDocumentHandler.AUGMENTATIONS, true);

    // Create the document handler and wire it up
    handler = new NekoHtmlDocumentHandler(doc, null, ignorableTags);
    handler.addStatusListener(statusListener);
    handler.setRepositioningInfo(repInfo);
    handler.setAmpCodingInfo(ampCodingInfo);

    // construct the list of offsets for each line of the document
    // NOTE(review): the guard above admits a document with a URL but null
    // content; in that case this line throws a NullPointerException that
    // is handled by the generic catch below as a parsing error - confirm
    // whether that situation can occur in practice.
    int[] lineOffsets = buildLineOffsets(doc.getContent().toString());
    handler.setLineOffsets(lineOffsets);

    // set the handlers
    parser.setDocumentHandler(handler);
    parser.setErrorHandler(handler);

    // Build the input source with the appropriate encoding
    XMLInputSource is;
    if (docHasContentButNoValidURL) {
      // no URL, so parse from string
      is = new XMLInputSource(null, null, null, new StringReader(doc.getContent().toString()), null);
    } else if (doc instanceof TextualDocument) {
      // textual document - load with user specified encoding
      String docEncoding = ((TextualDocument) doc).getEncoding();
      // No BOM stripping is done here.
      URLConnection conn = doc.getSourceUrl().openConnection();
      urlStream = conn.getInputStream();
      if ("gzip".equals(conn.getContentEncoding())) {
        urlStream = new GZIPInputStream(urlStream);
      }
      docReader = new InputStreamReader(urlStream, docEncoding);
      String systemId = doc.getSourceUrl().toString();
      is = new XMLInputSource(null, systemId, systemId, docReader, docEncoding);
      // since we control the encoding, tell the parser to ignore any
      // meta http-equiv hints
      parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true);
    } else {
      // let the parser decide the encoding
      String systemId = doc.getSourceUrl().toString();
      is = new XMLInputSource(null, systemId, systemId);
    }

    /* The following line can forward an
     * ArrayIndexOutOfBoundsException from
     * org.cyberneko.html.HTMLConfiguration.parse; it is handled by the
     * generic catch below. */
    parser.parse(is);

    ((DocumentImpl) doc).setNextAnnotationId(handler.getCustomObjectsId());
  } catch (IOException e) {
    /* Handle IOException specially. The URL is concatenated directly
     * (not via toString()) so that a null source URL cannot raise a
     * NullPointerException that would mask the original I/O error. */
    throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), e);
  } catch (Exception e) {
    /* Handle XNIException and ArrayIndexOutOfBoundsException (and any
     * other failure): flag the parsing error and, unless the document
     * asks for exceptions, keep going. */
    doc.getFeatures().put("parsingError", Boolean.TRUE);
    Boolean bThrow = (Boolean) doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
    if (bThrow != null && bThrow.booleanValue()) {
      // error
      throw new DocumentFormatException(e);
    } else {
      Out.println("Warning: Document remains unparsed. \n" + "\n Stack Dump: ");
      e.printStackTrace(Out.getPrintWriter());
    }
  } finally {
    // Release the URL stream. Closing the reader also closes the stream
    // it wraps; the raw stream is closed directly only when the reader
    // was never created.
    try {
      if (docReader != null) {
        docReader.close();
      } else if (urlStream != null) {
        urlStream.close();
      }
    } catch (IOException ignored) {
      // best-effort cleanup: a failure to close must not hide the outcome
      // of the parse
    }
    if (handler != null) {
      handler.removeStatusListener(statusListener);
    }
  }
}
Aggregations