use of gate.event.StatusListener in project gate-core by GateNLP.
the class XmlDocumentFormat method unpackMarkup.
// unpackMarkup
/**
 * Unpack the markup in the document. This converts markup from the
 * native format (e.g. XML) into annotations in GATE format. Uses the
 * markupElementsMap to determine which elements to convert, and what
 * annotation type names to use. If the document was created from a
 * String, it is recommended to set the doc's sourceUrl to <b>null</b>.
 * If the document has a valid URL, the parser will try to parse the
 * XML document pointed to by the URL. If the URL is not valid, or is
 * null, the doc's content will be parsed. If the doc's content is not
 * valid XML the parser might fail.
 * <p>
 * When content is available, at most its first 2048 characters are
 * handed to <code>isGateXmlFormat</code> to choose between GATE-format
 * unpacking and general XML unpacking. When the document has a source
 * URL but no content, the format cannot be sniffed and the general XML
 * path is taken.
 *
 * @param doc The gate document you want to parse. If
 *          <code>doc.getSourceUrl()</code> returns <b>null</b>
 *          then the content of doc will be parsed. Using a URL is
 *          recommended because the parser will report errors correctly
 *          if the XML document is not well formed.
 * @param repInfo repositioning information, forwarded unchanged to the
 *          general XML unpacking path
 * @param ampCodingInfo ampersand coding information, forwarded
 *          unchanged to the general XML unpacking path
 * @throws DocumentFormatException if <code>doc</code> is null or has
 *           neither a source URL nor content
 */
@Override
public void unpackMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo) throws DocumentFormatException {
  if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
    throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
  }

  // Relay parser status messages through this format's own listeners.
  StatusListener statusListener = new StatusListener() {
    @Override
    public void statusChanged(String text) {
      // This is implemented in DocumentFormat.java and inherited here
      fireStatusChanged(text);
    }
  };

  // Determine whether we have a GATE format XML document or another
  // kind. The guard above lets through a document that has a source URL
  // but no content, so the content must be null-checked before it is
  // inspected; previously that case failed with a NullPointerException.
  boolean gateFormat = false;
  if (doc.getContent() != null) {
    String content = doc.getContent().toString();
    if (content.length() > 2048) {
      // only the head of the document is needed to recognise the format
      content = content.substring(0, 2048);
    }
    gateFormat = isGateXmlFormat(content);
  }
  // NOTE(review): with null content we fall through to the general path,
  // which is expected to read from the source URL - confirm that
  // unpackGeneralXmlMarkup does not itself require content.

  if (gateFormat) {
    unpackGateFormatMarkup(doc, statusListener);
  } else {
    unpackGeneralXmlMarkup(doc, repInfo, ampCodingInfo, statusListener);
  }
}
use of gate.event.StatusListener in project gate-core by GateNLP.
the class PersistenceManager method getPersistentRepresentation.
/**
 * Returns the persistent equivalent of the provided object.
 * <p>
 * A replacement already registered for the target is reused. Otherwise
 * the most specific persistent type for the target's class is looked
 * up; if there is none, the target itself is returned provided it is
 * {@link Serializable}. When a persistent type exists, a new instance
 * is created, populated from the target via
 * <code>extractDataFromSource</code>, registered for later reuse and
 * returned.
 *
 * @param target the object to be analysed and translated into a
 *          persistent equivalent; may be <code>null</code>
 * @return the persistent equivalent value for the provided target, or
 *         <code>null</code> if the target is <code>null</code>
 * @throws PersistenceException if the target has no persistent type and
 *           is not serialisable, or if the persistent replacement
 *           cannot be instantiated
 */
public static Serializable getPersistentRepresentation(Object target) throws PersistenceException {
  if (target == null) {
    return null;
  }

  // reuse a replacement that has already been built for this object
  Persistence replacement = existingPersistentReplacements.get().getFirst().get(new ObjectHolder(target));
  if (replacement != null) {
    return replacement;
  }

  Class<? extends Object> sourceType = target.getClass();
  Class<?> persistentType = getMostSpecificPersistentType(sourceType);
  if (persistentType == null) {
    // no special handler: the object must be storable as it is
    if (!(target instanceof Serializable)) {
      throw new PersistenceException("Could not find a serialisable replacement for " + sourceType);
    }
    return (Serializable) target;
  }

  // a handler type exists: instantiate it, then fill it from the target
  try {
    replacement = (Persistence) persistentType.newInstance();
  } catch (Exception e) {
    throw new PersistenceException(e);
  }

  if (target instanceof NameBearer) {
    // report progress to the globally registered status listener, if any
    StatusListener listener = (StatusListener) Gate.getListeners().get("gate.event.StatusListener");
    if (listener != null) {
      listener.statusChanged("Storing " + ((NameBearer) target).getName());
    }
  }

  replacement.extractDataFromSource(target);
  existingPersistentReplacements.get().getFirst().put(new ObjectHolder(target), replacement);
  return replacement;
}
use of gate.event.StatusListener in project gate-core by GateNLP.
the class TikaFormat method unpackMarkup.
/**
 * Unpacks the markup of a document by running a Tika parser over the
 * stream opened from the document's source URL, with an
 * {@link XmlDocumentHandler} as the content handler. After parsing, the
 * collected metadata is applied to the document via
 * <code>setDocumentFeatures</code>.
 *
 * @param doc the GATE document to parse; it must have a source URL
 * @param repInfo repositioning information handed to the handler
 * @param ampCodingInfo ampersand coding positions handed to the handler
 * @throws DocumentFormatException if <code>doc</code> or its source URL
 *           is null, or if parsing fails with an I/O, SAX or Tika error
 */
@Override
public void unpackMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo) throws DocumentFormatException {
  if (doc == null || doc.getSourceUrl() == null) {
    throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
  }

  // Relay handler status messages through this format's own listeners.
  StatusListener relay = new StatusListener() {
    @Override
    public void statusChanged(String text) {
      // This is implemented in DocumentFormat.java and inherited here
      fireStatusChanged(text);
    }
  };

  XmlDocumentHandler handler = new XmlDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);
  Metadata metadata = extractParserTips(doc);
  handler.addStatusListener(relay);
  handler.setRepositioningInfo(repInfo);
  // set the object with ampersand coding positions
  handler.setAmpCodingInfo(ampCodingInfo);

  InputStream sourceStream = null;
  try {
    Parser parser = new TikaConfig().getParser();
    sourceStream = doc.getSourceUrl().openStream();
    parser.parse(sourceStream, handler, metadata, new ParseContext());
    setDocumentFeatures(metadata, doc);
  } catch (IOException e) {
    throw new DocumentFormatException(e);
  } catch (SAXException e) {
    throw new DocumentFormatException(e);
  } catch (TikaException e) {
    throw new DocumentFormatException(e);
  } finally {
    // closeQuietly tolerates a null stream (openStream may have failed)
    IOUtils.closeQuietly(sourceStream);
    handler.removeStatusListener(relay);
  }

  // only DocumentImpl exposes the next-annotation-id setter
  if (doc instanceof DocumentImpl) {
    DocumentImpl impl = (DocumentImpl) doc;
    impl.setNextAnnotationId(handler.getCustomObjectsId());
  }
}
use of gate.event.StatusListener in project gate-core by GateNLP.
the class DocumentStaxUtils method writeDocument.
/**
 * Write the specified GATE Document to an XMLStreamWriter. This
 * method writes just the GateDocument element - the XML declaration
 * must be filled in by the caller if required.
 * <p>
 * Output order: document features, text with nodes, the default
 * annotation set (if present), each named annotation set, then one
 * relation set per key of <code>annotationSets</code>.
 *
 * @param doc the Document to write
 * @param annotationSets the annotations to include. If the map
 *          contains an entry for the key <code>null</code>, this
 *          will be treated as the default set. All other entries are
 *          treated as named annotation sets.
 * @param xsw the StAX XMLStreamWriter to use for output
 * @param namespaceURI the namespace URI used for the elements written;
 *          it is set as the writer's default namespace and, when
 *          non-empty, declared on the GateDocument element. Must not be
 *          <code>null</code>.
 * @throws XMLStreamException if an error occurs during writing
 */
public static void writeDocument(Document doc, Map<String, Collection<Annotation>> annotationSets, XMLStreamWriter xsw, String namespaceURI) throws XMLStreamException {
  // root element
  xsw.setDefaultNamespace(namespaceURI);
  xsw.writeStartElement(namespaceURI, "GateDocument");
  xsw.writeAttribute("version", GATE_XML_VERSION);
  if (!namespaceURI.isEmpty()) {
    xsw.writeDefaultNamespace(namespaceURI);
  }
  newLine(xsw);

  // features
  xsw.writeComment(" The document's features");
  newLine(xsw);
  newLine(xsw);
  xsw.writeStartElement(namespaceURI, "GateDocumentFeatures");
  newLine(xsw);
  writeFeatures(doc.getFeatures(), xsw, namespaceURI);
  // closes GateDocumentFeatures
  xsw.writeEndElement();
  newLine(xsw);

  // text with nodes
  xsw.writeComment(" The document content area with serialized nodes ");
  newLine(xsw);
  newLine(xsw);
  writeTextWithNodes(doc, annotationSets.values(), xsw, namespaceURI);
  newLine(xsw);

  // progress is reported to the globally registered listener, if any
  StatusListener progress = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");

  // the default annotation set is stored under the null key
  if (annotationSets.containsKey(null)) {
    if (progress != null) {
      progress.statusChanged("Saving the default annotation set ");
    }
    xsw.writeComment(" The default annotation set ");
    newLine(xsw);
    newLine(xsw);
    writeAnnotationSet(annotationSets.get(null), null, xsw, namespaceURI);
    newLine(xsw);
  }

  // named annotation sets
  for (Map.Entry<String, Collection<Annotation>> namedSet : annotationSets.entrySet()) {
    String setName = namedSet.getKey();
    if (setName == null) {
      // the default set has already been written above
      continue;
    }
    xsw.writeComment(" Named annotation set ");
    newLine(xsw);
    newLine(xsw);
    if (progress != null) {
      progress.statusChanged("Saving " + setName + " annotation set ");
    }
    writeAnnotationSet(namedSet.getValue(), setName, xsw, namespaceURI);
    newLine(xsw);
  }

  // relation sets, looked up on the document by annotation set name
  for (String setName : annotationSets.keySet()) {
    writeRelationSet(doc.getAnnotations(setName).getRelations(), xsw, namespaceURI);
  }

  // close the GateDocument element
  xsw.writeEndElement();
  newLine(xsw);
}
use of gate.event.StatusListener in project gate-core by GateNLP.
the class NekoHtmlDocumentFormat method unpackMarkup.
/**
 * Unpack the markup in the document. This converts markup from the
 * native HTML format into annotations in GATE format.
 * <p>
 * The input is chosen as follows: if <code>hasContentButNoValidUrl</code>
 * reports that the document has content but no valid URL, the content
 * string is parsed; otherwise, for a {@link TextualDocument}, the source
 * URL is read using the document's own encoding (gunzipping when the
 * connection reports a "gzip" content encoding) and the parser is told
 * to ignore any charset specified in the page; otherwise the parser is
 * given the URL and left to decide the encoding itself.
 * <p>
 * Failures other than I/O errors set the <code>parsingError</code>
 * feature on the document; they are rethrown only if the document
 * feature named by {@link GateConstants#THROWEX_FORMAT_PROPERTY_NAME} is
 * <code>Boolean.TRUE</code>, and are otherwise printed as a warning.
 *
 * @param doc The gate document you want to parse. If the document was
 *          created from a String, it is recommended to set its
 *          sourceUrl to <b>null</b> so that the content is parsed.
 * @param repInfo repositioning information handed to the handler
 * @param ampCodingInfo ampersand coding positions handed to the handler
 * @throws DocumentFormatException if <code>doc</code> is null, has
 *           neither a source URL nor content, an I/O error occurs, or
 *           parsing fails and the document requests exceptions
 */
@Override
public void unpackMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo) throws DocumentFormatException {
  if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
    throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
  }

  // Relay handler status messages through this format's own listeners.
  StatusListener statusListener = new StatusListener() {
    @Override
    public void statusChanged(String text) {
      // This is implemented in DocumentFormat.java and inherited here
      fireStatusChanged(text);
    }
  };

  boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
  NekoHtmlDocumentHandler handler = null;
  // Tracked outside the try block so that whatever we opened on the
  // source URL is released in the finally clause.
  InputStream urlStream = null;
  Reader urlReader = null;
  try {
    HTMLConfiguration parser = new HTMLConfiguration();
    // convert element and attribute names to lower case
    parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
    parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
    // make parser augment infoset with location information
    parser.setFeature(NekoHtmlDocumentHandler.AUGMENTATIONS, true);

    // Create the document handler and wire it up
    handler = new NekoHtmlDocumentHandler(doc, null, ignorableTags);
    handler.addStatusListener(statusListener);
    handler.setRepositioningInfo(repInfo);
    // set the object with ampersand coding positions
    handler.setAmpCodingInfo(ampCodingInfo);
    // construct the list of offsets for each line of the document
    int[] lineOffsets = buildLineOffsets(doc.getContent().toString());
    handler.setLineOffsets(lineOffsets);

    parser.setDocumentHandler(handler);
    parser.setErrorHandler(handler);

    // Choose the input source, with the appropriate encoding
    XMLInputSource is;
    if (docHasContentButNoValidURL) {
      // no URL, so parse from string
      is = new XMLInputSource(null, null, null, new StringReader(doc.getContent().toString()), null);
    } else if (doc instanceof TextualDocument) {
      // textual document - load with user specified encoding
      String docEncoding = ((TextualDocument) doc).getEncoding();
      // no BOM stripping is done here
      URLConnection conn = doc.getSourceUrl().openConnection();
      urlStream = conn.getInputStream();
      if ("gzip".equals(conn.getContentEncoding())) {
        urlStream = new GZIPInputStream(urlStream);
      }
      urlReader = new InputStreamReader(urlStream, docEncoding);
      is = new XMLInputSource(null, doc.getSourceUrl().toString(), doc.getSourceUrl().toString(), urlReader, docEncoding);
      // since we control the encoding, tell the parser to ignore any
      // meta http-equiv hints
      parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true);
    } else {
      // let the parser decide the encoding
      is = new XMLInputSource(null, doc.getSourceUrl().toString(), doc.getSourceUrl().toString());
    }

    /* The following line can forward an
     * ArrayIndexOutOfBoundsException from
     * org.cyberneko.html.HTMLConfiguration.parse; it is handled by the
     * generic catch clause below. */
    parser.parse(is);

    // Guarded so that a successful parse of some other Document
    // implementation is not turned into a ClassCastException and then
    // reported as a parsing error by the catch clause below.
    if (doc instanceof DocumentImpl) {
      ((DocumentImpl) doc).setNextAnnotationId(handler.getCustomObjectsId());
    }
  } catch (IOException e) {
    /* Handle IOException specially. String concatenation is used rather
     * than toString() so that a null source URL cannot mask the real
     * error with a NullPointerException. */
    throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), e);
  } catch (Exception e) {
    /* Handle XNIException and ArrayIndexOutOfBoundsException:
     * flag the parsing error and keep going. */
    doc.getFeatures().put("parsingError", Boolean.TRUE);
    Boolean bThrow = (Boolean) doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
    if (bThrow != null && bThrow.booleanValue()) {
      // the document asked for parse failures to be propagated
      throw new DocumentFormatException(e);
    } else {
      Out.println("Warning: Document remains unparsed. \n" + "\n Stack Dump: ");
      e.printStackTrace(Out.getPrintWriter());
    }
  } finally {
    // Release the URL stream; closing the reader also closes the stream
    // it wraps. The raw stream is closed directly only when the reader
    // was never created (e.g. unsupported encoding).
    try {
      if (urlReader != null) {
        urlReader.close();
      } else if (urlStream != null) {
        urlStream.close();
      }
    } catch (IOException ignored) {
      // best effort: a failure to close must not hide the parse outcome
    }
    if (handler != null) {
      handler.removeStatusListener(statusListener);
    }
  }
}
Aggregations