use of gate.event.StatusListener in project gate-core by GateNLP.
the class SgmlDocumentFormat method unpackMarkup.
/**
 * Unpacks the markup in the document, converting markup from the native
 * format (SGML) into annotations in GATE format.
 * <p>
 * The document is first converted to well-formed XML by {@code Sgml2Xml}.
 * The URI returned by that conversion is then parsed as an XML document with
 * a SAX parser that feeds an {@code XmlDocumentHandler}, which is given this
 * format's {@code markupElementsMap} and {@code element2StringMap}. Status
 * messages emitted by the handler are forwarded to this format's listeners
 * through {@code fireStatusChanged}.
 *
 * @param doc the GATE document to parse; it must be non-null and have either
 *          a source URL or content.
 * @throws DocumentFormatException if {@code doc} is null or has neither a
 *           source URL nor content, or if parser configuration, SAX parsing
 *           or I/O fails (the underlying exception is kept as the cause).
 */
@Override
public void unpackMarkup(Document doc) throws DocumentFormatException {
  if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
    throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
  }

  // Relays status messages from the XML handler to this format's listeners.
  StatusListener statusListener = new StatusListener() {
    @Override
    public void statusChanged(String text) {
      fireStatusChanged(text);
    }
  };

  XmlDocumentHandler xmlDocHandler = null;
  try {
    Sgml2Xml sgml2Xml = new Sgml2Xml(doc);
    fireStatusChanged("Performing SGML to XML...");
    // Convert the SGML document; the result is the URI of the XML version.
    String xmlUri = sgml2Xml.convert();
    fireStatusChanged("DONE !");

    // Configure a non-validating, namespace-aware SAX parser.
    // NOTE(review): the factory is used with its default feature set, i.e.
    // DTD / external entity processing is not explicitly disabled here.
    SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
    saxParserFactory.setValidating(false);
    saxParserFactory.setNamespaceAware(true);
    SAXParser parser = saxParserFactory.newSAXParser();

    // Create the handler that turns XML elements into annotations and let it
    // report progress through the relay listener.
    xmlDocHandler = new XmlDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);
    xmlDocHandler.addStatusListener(statusListener);
    parser.parse(xmlUri, xmlDocHandler);

    // Continue annotation numbering after the IDs used by the handler.
    ((DocumentImpl) doc).setNextAnnotationId(xmlDocHandler.getCustomObjectsId());
  } catch (ParserConfigurationException e) {
    throw new DocumentFormatException("XML parser configuration exception ", e);
  } catch (SAXException e) {
    throw new DocumentFormatException(e);
  } catch (IOException e) {
    // The source URL may legitimately be null (document created from
    // content only), so it must not be dereferenced unconditionally.
    String source = doc.getSourceUrl() != null
        ? doc.getSourceUrl().toString()
        : "document without source URL";
    throw new DocumentFormatException("I/O exception for " + source, e);
  } finally {
    // Always detach the relay listener, even when parsing failed.
    if (xmlDocHandler != null) {
      xmlDocHandler.removeStatusListener(statusListener);
    }
  }
}
use of gate.event.StatusListener in project gate-core by GateNLP.
the class DocumentXmlUtils method toXml.
/**
 * Serializes a GATE document into the GATE custom XML format (the format
 * read back by gate.xml.GateFormatXmlHandler).
 * <p>
 * The output consists of an XML header carrying the document encoding, the
 * document features, the text with serialized nodes, the default annotation
 * set and finally every named annotation set. Progress is reported to the
 * globally registered "gate.event.StatusListener", if there is one.
 *
 * @param doc the document to serialize.
 * @return a string representing a Gate Xml document.
 */
public static String toXml(TextualDocument doc) {
  // Tags make the output considerably larger than the raw text, so the
  // buffer is pre-sized to a multiple of the document length.
  int initialCapacity = DOC_SIZE_MULTIPLICATION_FACTOR * (doc.getContent().size().intValue());
  StringBuffer out = new StringBuffer(initialCapacity);

  // XML header
  out.append("<?xml version=\"1.0\" encoding=\"");
  out.append(doc.getEncoding());
  out.append("\" ?>");
  out.append(Strings.getNl());

  // Root element and document features
  out.append("<GateDocument>\n");
  out.append("<!-- The document's features-->\n\n");
  out.append("<GateDocumentFeatures>\n");
  out.append(featuresToXml(doc.getFeatures(), null));
  out.append("</GateDocumentFeatures>\n");

  // Plain text interleaved with node markers
  out.append("<!-- The document content area with serialized nodes -->\n\n");
  out.append("<TextWithNodes>");
  out.append(textWithNodes(doc, doc.getContent().toString()));
  out.append("</TextWithNodes>\n");

  StatusListener listener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");

  // Default annotation set
  if (listener != null) {
    listener.statusChanged("Saving the default annotation set ");
  }
  out.append("<!-- The default annotation set -->\n\n");
  annotationSetToXml(doc.getAnnotations(), out);

  // All named annotation sets
  Map<String, AnnotationSet> namedSets = doc.getNamedAnnotationSets();
  if (namedSets != null) {
    for (AnnotationSet namedSet : namedSets.values()) {
      out.append("<!-- Named annotation set -->\n\n");
      if (listener != null) {
        listener.statusChanged("Saving " + namedSet.getName() + " annotation set ");
      }
      annotationSetToXml(namedSet, out);
    }
  }

  // Close the root element
  out.append("</GateDocument>");
  if (listener != null) {
    listener.statusChanged("Done !");
  }
  return out.toString();
}
use of gate.event.StatusListener in project gate-core by GateNLP.
the class DocumentImpl method init.
/**
 * Initialises this resource and returns it.
 * <p>
 * The document content is created either from the string content or from the
 * source URL (restricted to the configured start/end offsets). If requested,
 * the original content is preserved as a document feature. When the document
 * is markup aware, a {@code DocumentFormat} is looked up (by explicit MIME
 * type if one was given, otherwise from the source URL) and used to unpack
 * the markup, optionally collecting repositioning information.
 *
 * @return this document.
 * @throws ResourceInstantiationException if neither a source URL nor string
 *           content is available, if the content cannot be read, if the
 *           given MIME type has no registered format, or if unpacking the
 *           markup fails.
 */
@Override
public Resource init() throws ResourceInstantiationException {
  // set up the source URL and create the content
  if (sourceUrl == null) {
    if (stringContent == null) {
      throw new ResourceInstantiationException("The sourceURL and document's content were null.");
    }
    content = new DocumentContentImpl(stringContent);
    getFeatures().put("gate.SourceURL", "created from String");
  } else {
    try {
      content = new DocumentContentImpl(sourceUrl, getEncoding(), sourceUrlStartOffset, sourceUrlEndOffset);
      getFeatures().put("gate.SourceURL", sourceUrl.toExternalForm());
    } catch (IOException e) {
      // keep the original exception as the cause so the stack trace survives
      throw new ResourceInstantiationException("DocumentImpl.init: " + e, e);
    }
  }

  if (preserveOriginalContent.booleanValue() && content != null) {
    String originalContent = ((DocumentContentImpl) content).getOriginalContent();
    getFeatures().put(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME, originalContent);
  }

  // set up a DocumentFormat if markup unpacking required
  if (getMarkupAware().booleanValue()) {
    DocumentFormat docFormat = null;
    // if a specific MIME type has been given, use it
    if (this.mimeType != null && this.mimeType.length() > 0) {
      MimeType theType = DocumentFormat.getMimeTypeForString(mimeType);
      if (theType == null) {
        throw new ResourceInstantiationException("MIME type \"" + this.mimeType + "\" has no registered DocumentFormat");
      }
      docFormat = DocumentFormat.getDocumentFormat(this, theType);
    } else {
      docFormat = DocumentFormat.getDocumentFormat(this, sourceUrl);
    }

    if (docFormat != null) {
      StatusListener sListener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
      if (sListener != null) {
        docFormat.addStatusListener(sListener);
      }
      try {
        // set the flag if true and if the document format support collecting
        docFormat.setShouldCollectRepositioning(collectRepositioningInfo);
        if (docFormat.getShouldCollectRepositioning().booleanValue()) {
          // unpack with collecting of repositioning information
          RepositioningInfo info = new RepositioningInfo();
          String origContent = (String) getFeatures().get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
          RepositioningInfo ampCodingInfo = new RepositioningInfo();
          if (origContent != null) {
            boolean shouldCorrectCR = docFormat instanceof XmlDocumentFormat;
            collectInformationForAmpCodding(origContent, ampCodingInfo, shouldCorrectCR);
            if (docFormat.getMimeType().equals(new MimeType("text", "html"))) {
              collectInformationForWS(origContent, ampCodingInfo);
            }
          }
          docFormat.unpackMarkup(this, info, ampCodingInfo);
          if (origContent != null && docFormat instanceof XmlDocumentFormat) {
            // CRLF correction of RepositioningInfo
            correctRepositioningForCRLFInXML(origContent, info);
          }
          getFeatures().put(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME, info);
        } else {
          // normal old fashioned unpack
          docFormat.unpackMarkup(this);
        }
      } catch (DocumentFormatException e) {
        throw new ResourceInstantiationException("Couldn't unpack markup in document " + (sourceUrl != null ? sourceUrl.toExternalForm() : "") + "!", e);
      } finally {
        // Detach the listener on every path: leaving it registered after a
        // failed unpack would keep it attached to the format object.
        if (sListener != null) {
          docFormat.removeStatusListener(sListener);
        }
      }
    }
  }
  return this;
}
use of gate.event.StatusListener in project gate-core by GateNLP.
the class DocumentImpl method saveAnnotationSetAsXmlInOrig.
// hasOriginalContentFeatures
/**
* Inserts XML tags for the annotations of aSourceAnnotationSet into the
* original document content (as preserved in the
* ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME feature) and returns the result.
* Annotation offsets are mapped back to positions in the original content
* through the RepositioningInfo stored in the
* DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME feature.
* <p>
* Side effect: annotations are removed from aSourceAnnotationSet while their
* start (or empty) tags are written, so the set passed in is modified.
*
* @param aSourceAnnotationSet
* is a GATE annotation set prepared to be used on the raw text from
* document content. If it is <b>null</b> the preserved original
* content is returned unchanged (an empty string if no original
* content feature is present).
* @param includeFeatures
* is a boolean, which controls whether the annotation features and
* gate ID are included or not.
* @return The XML document obtained from raw text + the information from the
* dump annotation set.
*/
private String saveAnnotationSetAsXmlInOrig(Set<Annotation> aSourceAnnotationSet, boolean includeFeatures) {
StringBuffer docContStrBuff;
String origContent;
origContent = (String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
// Fall back to empty content when the original was not preserved.
if (origContent == null) {
origContent = "";
}
// if
long originalContentSize = origContent.length();
// NOTE(review): repositioning is dereferenced below without a null check;
// presumably callers only get here when the feature is present - confirm.
RepositioningInfo repositioning = (RepositioningInfo) getFeatures().get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);
// The tags are inserted directly into a copy of the original content.
docContStrBuff = new StringBuffer(origContent);
// Nothing to add: return the original content as is.
if (aSourceAnnotationSet == null)
return docContStrBuff.toString();
StatusListener sListener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
AnnotationSet originalMarkupsAnnotSet = this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
// Create a dumping annotation set on the document. It will be used for
// dumping annotations...
AnnotationSet dumpingSet = new AnnotationSetImpl(this);
if (sListener != null)
sListener.statusChanged("Constructing the dumping annotation set.");
// Then take all the annotations from aSourceAnnotationSet and verify if
// they can be inserted safely into the dumpingSet. Where not possible,
// report.
// NOTE(review): dumpingSet is only consulted by insertsSafety here; the
// serialization below iterates aSourceAnnotationSet, not dumpingSet -
// confirm that rejected annotations really end up excluded from the output.
Iterator<Annotation> iter = aSourceAnnotationSet.iterator();
Annotation currentAnnot;
while (iter.hasNext()) {
currentAnnot = iter.next();
// The annotation must be compatible both with the original markups and
// with the annotations accepted so far.
if (insertsSafety(originalMarkupsAnnotSet, currentAnnot) && insertsSafety(dumpingSet, currentAnnot)) {
dumpingSet.add(currentAnnot);
} else {
Out.prln("Warning: Annotation with ID=" + currentAnnot.getId() + ", startOffset=" + currentAnnot.getStartNode().getOffset() + ", endOffset=" + currentAnnot.getEndNode().getOffset() + ", type=" + currentAnnot.getType() + " was found to violate the" + " crossed over condition. It will be discarded");
}
// End if
}
// Here we go.
if (sListener != null)
sListener.statusChanged("Dumping annotations as XML");
// /////////////////////////////////////////
// Collect every distinct start and end offset of the source annotations in
// a sorted set. For each offset all the tags that belong there are written
// from left to right into a temporary buffer.
TreeSet<Long> offsets = new TreeSet<Long>();
iter = aSourceAnnotationSet.iterator();
while (iter.hasNext()) {
Annotation annot = iter.next();
offsets.add(annot.getStartNode().getOffset());
offsets.add(annot.getEndNode().getOffset());
}
// Process the offsets from the highest to the lowest; presumably so that
// inserting text at a later position does not shift the positions that are
// still to be processed.
while (!offsets.isEmpty()) {
Long offset = offsets.last();
// Remove the offset from the set
offsets.remove(offset);
// Now, use it.
// Returns a list with annotations that needs to be serialized in that
// offset.
List<Annotation> annotations = getAnnotationsForOffset(aSourceAnnotationSet, offset);
// Attention: the annotation are serialized from left to right
StringBuffer tmpBuff = new StringBuffer("");
// Holds zero-length "isEmptyAndSpan" annotations whose start tag has been
// written and whose end tag is still pending.
Stack<Annotation> stack = new Stack<Annotation>();
// Iterate through all these annotations and serialize them
Iterator<Annotation> it = annotations.iterator();
// Declared outside the loop: after the loop it refers to the last
// annotation handled at this offset (used for back positioning below).
Annotation a = null;
while (it.hasNext()) {
a = it.next();
it.remove();
// Test if a Ends at offset
if (offset.equals(a.getEndNode().getOffset())) {
// Test if a Starts at offset
if (offset.equals(a.getStartNode().getOffset())) {
// Here, the annotation a Starts and Ends at the offset
if (null != a.getFeatures().get("isEmptyAndSpan") && "true".equals(a.getFeatures().get("isEmptyAndSpan"))) {
// Assert: annotation a with start == end and isEmptyAndSpan
tmpBuff.append(writeStartTag(a, includeFeatures, false));
stack.push(a);
} else {
// Assert annotation a with start == end and an empty tag
tmpBuff.append(writeEmptyTag(a, false));
// The annotation is removed from dumped set
aSourceAnnotationSet.remove(a);
}
// End if
} else {
// a ends here but starts elsewhere: close any pending zero-length
// annotations first, then write a's end tag. a stays in the source
// set; its start tag is written when its start offset is processed.
while (!stack.isEmpty()) {
Annotation a1 = stack.pop();
tmpBuff.append(writeEndTag(a1));
}
// End while
tmpBuff.append(writeEndTag(a));
}
// End if
} else {
// a does not end at the offset; check whether it starts here
if (offset.equals(a.getStartNode().getOffset())) {
// In this case empty the stack and write the start tag
while (!stack.isEmpty()) {
Annotation a1 = stack.pop();
tmpBuff.append(writeEndTag(a1));
}
// End while
tmpBuff.append(writeStartTag(a, includeFeatures, false));
// The annotation is removed from dumped set
aSourceAnnotationSet.remove(a);
}
// End if ( offset.equals(a.getStartNode().getOffset()) )
}
// End if ( offset.equals(a.getEndNode().getOffset()) )
}
// Close whatever zero-length annotations are still pending at this offset.
while (!stack.isEmpty()) {
Annotation a1 = stack.pop();
tmpBuff.append(writeEndTag(a1));
}
// End while
// Map the document offset back to a position in the original content;
// -1 is treated as "no position found".
long originalPosition = -1;
// True when the last annotation handled at this offset ends here.
boolean backPositioning = a != null && offset.equals(a.getEndNode().getOffset());
if (backPositioning) {
// end of the annotation correction
// NOTE(review): exact meaning of the boolean flag is defined by
// RepositioningInfo.getOriginalPos - confirm there.
originalPosition = repositioning.getOriginalPos(offset.intValue(), true);
}
// if
// Fall back to the plain lookup when back positioning was not applicable
// or did not yield a position.
if (originalPosition == -1) {
originalPosition = repositioning.getOriginalPos(offset.intValue());
}
// Insert tmpBuff to the location where it belongs in docContStrBuff
if (originalPosition != -1 && originalPosition <= originalContentSize) {
docContStrBuff.insert((int) originalPosition, tmpBuff.toString());
} else {
// The tags collected for this offset are dropped; only a message is printed.
Out.prln("Error in the repositioning. The offset (" + offset.intValue() + ") could not be positioned in the original document. \n" + "Calculated position is: " + originalPosition + " placed back: " + backPositioning);
}
// if
}
// End while(!offsets.isEmpty())
// Append the end tag of the root annotation, if one is set on this document.
if (theRootAnnotation != null)
docContStrBuff.append(writeEndTag(theRootAnnotation));
return docContStrBuff.toString();
}
use of gate.event.StatusListener in project gate-core by GateNLP.
the class DocumentImpl method toXml.
/**
 * Returns an XML document aiming to preserve the original markups (the
 * original markup will be in the same place and format as it was before
 * processing the document) and include (if possible) the annotations
 * specified in the aSourceAnnotationSet. <b>Warning:</b> Annotations from
 * the aSourceAnnotationSet will be lost if they would cause a crossed over
 * situation.
 * <p>
 * If the document still has its original content features, the work is
 * delegated to {@code saveAnnotationSetAsXmlInOrig}.
 *
 * @param aSourceAnnotationSet
 *          is an annotation set containing all the annotations that will be
 *          combined with the original markup set. If the param is
 *          <code>null</code> it will only dump the original markups.
 * @param includeFeatures
 *          is a boolean that controls whether the annotation features should
 *          be included or not. If false, only the annotation type is included
 *          in the tag.
 * @return a string representing an XML document containing the original
 *         markup + dumped annotations from the aSourceAnnotationSet
 * @throws GateRuntimeException if, while building a debug warning, the
 *           offsets of a conflicting annotation are invalid for the content.
 */
@Override
@SuppressWarnings("unused")
public String toXml(Set<Annotation> aSourceAnnotationSet, boolean includeFeatures) {
  if (hasOriginalContentFeatures()) {
    return saveAnnotationSetAsXmlInOrig(aSourceAnnotationSet, includeFeatures);
  }

  AnnotationSet originalMarkupsAnnotSet = this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
  // The list of annotations to dump: all original markups plus every source
  // annotation that can be inserted without crossing an existing one.
  List<Annotation> dumpingList = new ArrayList<Annotation>(originalMarkupsAnnotSet.size());

  StatusListener sListener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
  if (sListener != null) {
    sListener.statusChanged("Constructing the dumping annotation set.");
  }

  // First add all annotations from the original markups
  dumpingList.addAll(originalMarkupsAnnotSet);

  // Then add the source annotations that do not conflict; report the others
  // when debugging is enabled.
  if (aSourceAnnotationSet != null) {
    for (Annotation currentAnnot : aSourceAnnotationSet) {
      if (insertsSafety(dumpingList, currentAnnot)) {
        dumpingList.add(currentAnnot);
      } else if (crossedOverAnnotation != null && DEBUG) {
        try {
          Out.prln("Warning: Annotations were found to violate the " + "crossed over condition: \n"
              + "1. [" + getContent().getContent(crossedOverAnnotation.getStartNode().getOffset(), crossedOverAnnotation.getEndNode().getOffset())
              + " (" + crossedOverAnnotation.getType() + ": " + crossedOverAnnotation.getStartNode().getOffset() + ";" + crossedOverAnnotation.getEndNode().getOffset() + ")]\n"
              + "2. [" + getContent().getContent(currentAnnot.getStartNode().getOffset(), currentAnnot.getEndNode().getOffset())
              + " (" + currentAnnot.getType() + ": " + currentAnnot.getStartNode().getOffset() + ";" + currentAnnot.getEndNode().getOffset() + ")]\nThe second one will be discarded.\n");
        } catch (gate.util.InvalidOffsetException ex) {
          // keep the original exception as the cause
          throw new GateRuntimeException(ex.getMessage(), ex);
        }
      }
    }
  }

  // kalina: order the dumping list by start offset
  Collections.sort(dumpingList, new gate.util.OffsetComparator());

  if (sListener != null) {
    sListener.statusChanged("Dumping annotations as XML");
  }
  StringBuffer xmlDoc = new StringBuffer(DocumentXmlUtils.DOC_SIZE_MULTIPLICATION_FACTOR * (this.getContent().size().intValue()));

  // Add xml header if original format was xml. The local is deliberately not
  // called mimeType so that it does not shadow the field of that name.
  String mimeTypeFeature = (String) getFeatures().get("MimeType");
  boolean wasXML = mimeTypeFeature != null && mimeTypeFeature.equalsIgnoreCase("text/xml");
  if (wasXML) {
    xmlDoc.append("<?xml version=\"1.0\" encoding=\"");
    xmlDoc.append(getEncoding());
    xmlDoc.append("\" ?>");
    xmlDoc.append(Strings.getNl());
  }

  // Identify the root annotation and take it out of the list: its tags are
  // written around the rest of the document.
  theRootAnnotation = identifyTheRootAnnotation(dumpingList);
  if (theRootAnnotation != null) {
    dumpingList.remove(theRootAnnotation);
    xmlDoc.append(writeStartTag(theRootAnnotation, includeFeatures));
  }

  // Construct and append the rest of the document
  xmlDoc.append(saveAnnotationSetAsXml(dumpingList, includeFeatures));

  // end of the document
  if (theRootAnnotation != null) {
    xmlDoc.append(writeEndTag(theRootAnnotation));
  }

  if (sListener != null) {
    sListener.statusChanged("Done.");
  }
  return xmlDoc.toString();
}
Aggregations