Search in sources :

Example 6 with StatusListener

use of gate.event.StatusListener in project gate-core by GateNLP.

the class SgmlDocumentFormat method unpackMarkup.

/**
 * Unpack the markup in the document. This converts markup from the
 * native format (e.g. SGML) into annotations in GATE format.
 * Uses the markupElementsMap to determine which elements to convert, and
 * what annotation type names to use.
 * The doc's content is first converted to well-formed XML.
 * If this succeeds, the URI returned by the converter is parsed as an XML
 * document and the next annotation id of the document is updated from the
 * handler.
 *
 * @param doc The gate document you want to parse.
 * @throws DocumentFormatException if {@code doc} is null, if it has neither
 *           a source URL nor content, or if the XML parser cannot be
 *           configured, fails to parse, or hits an I/O error.
 */
@Override
public void unpackMarkup(Document doc) throws DocumentFormatException {
    if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
        throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    }
    // Relay status messages coming from the XML handler to this format's
    // own status listeners.
    StatusListener statusListener = new StatusListener() {

        @Override
        public void statusChanged(String text) {
            fireStatusChanged(text);
        }
    };
    XmlDocumentHandler xmlDocHandler = null;
    try {
        Sgml2Xml sgml2Xml = new Sgml2Xml(doc);
        fireStatusChanged("Performing SGML to XML...");
        // convert the SGML document
        String xmlUri = sgml2Xml.convert();
        fireStatusChanged("DONE !");
        // Get a parser factory and configure it for a non-validating,
        // namespace-aware parser.
        SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
        saxParserFactory.setValidating(false);
        saxParserFactory.setNamespaceAware(true);
        // Create a SAX parser
        SAXParser parser = saxParserFactory.newSAXParser();
        // create a new Xml document handler
        xmlDocHandler = new XmlDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);
        // register a status listener with it
        xmlDocHandler.addStatusListener(statusListener);
        parser.parse(xmlUri, xmlDocHandler);
        ((DocumentImpl) doc).setNextAnnotationId(xmlDocHandler.getCustomObjectsId());
    } catch (ParserConfigurationException e) {
        throw new DocumentFormatException("XML parser configuration exception ", e);
    } catch (SAXException e) {
        throw new DocumentFormatException(e);
    } catch (IOException e) {
        // The source URL may legitimately be null (document built from
        // content only), so rely on string concatenation instead of calling
        // toString() on it, and keep the original exception as the cause.
        throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), e);
    } finally {
        // Always detach the relay listener, even when parsing failed.
        if (xmlDocHandler != null) {
            xmlDocHandler.removeStatusListener(statusListener);
        }
    }
}
Also used : DocumentFormatException(gate.util.DocumentFormatException) Sgml2Xml(gate.sgml.Sgml2Xml) XmlDocumentHandler(gate.xml.XmlDocumentHandler) StatusListener(gate.event.StatusListener) IOException(java.io.IOException) SAXException(org.xml.sax.SAXException)

Example 7 with StatusListener

use of gate.event.StatusListener in project gate-core by GateNLP.

the class DocumentXmlUtils method toXml.

/**
 * Serializes a GATE document into the GateXml format, a custom XML format
 * for which GATE has a reader called gate.xml.GateFormatXmlHandler.
 * The output contains, in order: the XML declaration, the document
 * features, the text with nodes, the default annotation set and then every
 * named annotation set.
 *
 * @param doc the document to serialize.
 * @return a string representing a Gate Xml document.
 */
public static String toXml(TextualDocument doc) {
    // Pre-size the buffer to several times the document size: the tags add
    // a lot of text on top of the raw content.
    int initialCapacity = DOC_SIZE_MULTIPLICATION_FACTOR * (doc.getContent().size().intValue());
    StringBuffer out = new StringBuffer(initialCapacity);

    // XML declaration
    out.append("<?xml version=\"1.0\" encoding=\"")
        .append(doc.getEncoding())
        .append("\" ?>")
        .append(Strings.getNl());

    // Root element followed by the document features
    out.append("<GateDocument>\n");
    out.append("<!-- The document's features-->\n\n");
    out.append("<GateDocumentFeatures>\n");
    out.append(featuresToXml(doc.getFeatures(), null));
    out.append("</GateDocumentFeatures>\n");

    // Plain text interleaved with node markers
    out.append("<!-- The document content area with serialized nodes -->\n\n");
    out.append("<TextWithNodes>");
    out.append(textWithNodes(doc, doc.getContent().toString()));
    out.append("</TextWithNodes>\n");

    // The globally registered status listener (if any) gets progress updates.
    StatusListener statusListener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");

    // Default annotation set first
    if (statusListener != null) {
        statusListener.statusChanged("Saving the default annotation set ");
    }
    out.append("<!-- The default annotation set -->\n\n");
    annotationSetToXml(doc.getAnnotations(), out);

    // Then every named annotation set, in the map's iteration order
    Map<String, AnnotationSet> namedSets = doc.getNamedAnnotationSets();
    if (namedSets != null) {
        for (AnnotationSet namedSet : namedSets.values()) {
            out.append("<!-- Named annotation set -->\n\n");
            if (statusListener != null) {
                statusListener.statusChanged("Saving " + namedSet.getName() + " annotation set ");
            }
            annotationSetToXml(namedSet, out);
        }
    }

    // Close the root element
    out.append("</GateDocument>");
    if (statusListener != null) {
        statusListener.statusChanged("Done !");
    }
    return out.toString();
}
Also used : AnnotationSet(gate.AnnotationSet) StatusListener(gate.event.StatusListener)

Example 8 with StatusListener

use of gate.event.StatusListener in project gate-core by GateNLP.

the class DocumentImpl method init.

/**
 * Initialise this resource, and return it.
 * <p>
 * The content is created either from the string content or from the source
 * URL. If original-content preservation is enabled, the original content is
 * stored as a document feature. If the document is markup aware, a
 * {@code DocumentFormat} is looked up (by explicit MIME type when one is
 * set, otherwise from the source URL) and used to unpack the markup,
 * optionally collecting repositioning information.
 *
 * @return this resource.
 * @throws ResourceInstantiationException if both the source URL and the
 *           string content are null, if the content cannot be read, if the
 *           given MIME type has no registered format, or if unpacking the
 *           markup fails.
 */
@Override
public Resource init() throws ResourceInstantiationException {
    // set up the source URL and create the content
    if (sourceUrl == null) {
        if (stringContent == null) {
            throw new ResourceInstantiationException("The sourceURL and document's content were null.");
        }
        content = new DocumentContentImpl(stringContent);
        getFeatures().put("gate.SourceURL", "created from String");
    } else {
        try {
            content = new DocumentContentImpl(sourceUrl, getEncoding(), sourceUrlStartOffset, sourceUrlEndOffset);
            getFeatures().put("gate.SourceURL", sourceUrl.toExternalForm());
        } catch (IOException e) {
            // Message is unchanged; the exception is additionally chained as
            // the cause so the stack trace is not lost.
            throw new ResourceInstantiationException("DocumentImpl.init: " + e, e);
        }
    }
    if (preserveOriginalContent.booleanValue() && content != null) {
        String originalContent = ((DocumentContentImpl) content).getOriginalContent();
        getFeatures().put(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME, originalContent);
    }
    // set up a DocumentFormat if markup unpacking required
    if (getMarkupAware().booleanValue()) {
        DocumentFormat docFormat = null;
        // if a specific MIME type has been given, use it
        if (this.mimeType != null && this.mimeType.length() > 0) {
            MimeType theType = DocumentFormat.getMimeTypeForString(mimeType);
            if (theType == null) {
                throw new ResourceInstantiationException("MIME type \"" + this.mimeType + "\" has no registered DocumentFormat");
            }
            docFormat = DocumentFormat.getDocumentFormat(this, theType);
        } else {
            docFormat = DocumentFormat.getDocumentFormat(this, sourceUrl);
        }
        try {
            if (docFormat != null) {
                StatusListener sListener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
                if (sListener != null) {
                    docFormat.addStatusListener(sListener);
                }
                try {
                    // set the flag if true and if the document format support collecting
                    docFormat.setShouldCollectRepositioning(collectRepositioningInfo);
                    if (docFormat.getShouldCollectRepositioning().booleanValue()) {
                        // unpack with collection of repositioning information
                        RepositioningInfo info = new RepositioningInfo();
                        String origContent = (String) getFeatures().get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
                        RepositioningInfo ampCodingInfo = new RepositioningInfo();
                        if (origContent != null) {
                            boolean shouldCorrectCR = docFormat instanceof XmlDocumentFormat;
                            collectInformationForAmpCodding(origContent, ampCodingInfo, shouldCorrectCR);
                            if (docFormat.getMimeType().equals(new MimeType("text", "html"))) {
                                collectInformationForWS(origContent, ampCodingInfo);
                            }
                        }
                        docFormat.unpackMarkup(this, info, ampCodingInfo);
                        if (origContent != null && docFormat instanceof XmlDocumentFormat) {
                            // CRLF correction of RepositioningInfo
                            correctRepositioningForCRLFInXML(origContent, info);
                        }
                        getFeatures().put(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME, info);
                    } else {
                        // normal old fashioned unpack
                        docFormat.unpackMarkup(this);
                    }
                } finally {
                    // Detach the listener on every exit path; previously a
                    // failed unpack left it registered on the format.
                    if (sListener != null) {
                        docFormat.removeStatusListener(sListener);
                    }
                }
            }
        } catch (DocumentFormatException e) {
            throw new ResourceInstantiationException("Couldn't unpack markup in document " + (sourceUrl != null ? sourceUrl.toExternalForm() : "") + "!", e);
        }
    }
    return this;
}
Also used : DocumentFormatException(gate.util.DocumentFormatException) DocumentFormat(gate.DocumentFormat) IOException(java.io.IOException) StatusListener(gate.event.StatusListener) ResourceInstantiationException(gate.creole.ResourceInstantiationException)

Example 9 with StatusListener

use of gate.event.StatusListener in project gate-core by GateNLP.

the class DocumentImpl method saveAnnotationSetAsXmlInOrig.

// hasOriginalContentFeatures
/**
 * This method saves the annotations from aSourceAnnotationSet and combines
 * them with the original document content, if preserved as feature.
 * <p>
 * Start/end/empty tags are built per annotation offset and inserted into a
 * copy of the original content at the position reported by the document's
 * repositioning info. Offsets that cannot be mapped back are reported via
 * {@code Out.prln} and skipped.
 * <p>
 * Note that this method removes elements from aSourceAnnotationSet while
 * serializing, so the caller's set is modified.
 *
 * @param aSourceAnnotationSet
 *          is a GATE annotation set prepared to be used on the raw text from
 *          document content. If it is <b>null</b> then the preserved
 *          original content is returned unchanged (an empty string when no
 *          original content was preserved).
 * @param includeFeatures
 *          is a boolean, which controls whether the annotation features and
 *          gate ID are included or not.
 * @return The XML document obtained from raw text + the information from the
 *         source annotation set.
 */
private String saveAnnotationSetAsXmlInOrig(Set<Annotation> aSourceAnnotationSet, boolean includeFeatures) {
    StringBuffer docContStrBuff;
    String origContent;
    origContent = (String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
    // Fall back to an empty document when no original content was preserved.
    if (origContent == null) {
        origContent = "";
    }
    long originalContentSize = origContent.length();
    // NOTE(review): repositioning is dereferenced below without a null check;
    // presumably callers only get here when the feature is present - confirm.
    RepositioningInfo repositioning = (RepositioningInfo) getFeatures().get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);
    docContStrBuff = new StringBuffer(origContent);
    // Nothing to merge in: return the original content as is.
    if (aSourceAnnotationSet == null)
        return docContStrBuff.toString();
    StatusListener sListener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
    AnnotationSet originalMarkupsAnnotSet = this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
    // Create a dumping annotation set on the document. It will be used for
    // dumping annotations...
    AnnotationSet dumpingSet = new AnnotationSetImpl(this);
    if (sListener != null)
        sListener.statusChanged("Constructing the dumping annotation set.");
    // Then take all the annotations from aSourceAnnotationSet and verify if
    // they can be inserted safely into the dumpingSet. Where not possible,
    // report.
    // NOTE(review): dumpingSet is only consulted inside this loop; the
    // serialization further down iterates aSourceAnnotationSet itself, so
    // annotations reported as "discarded" here are not actually filtered
    // out by this method - confirm whether that is intended.
    Iterator<Annotation> iter = aSourceAnnotationSet.iterator();
    Annotation currentAnnot;
    while (iter.hasNext()) {
        currentAnnot = iter.next();
        if (insertsSafety(originalMarkupsAnnotSet, currentAnnot) && insertsSafety(dumpingSet, currentAnnot)) {
            dumpingSet.add(currentAnnot);
        } else {
            Out.prln("Warning: Annotation with ID=" + currentAnnot.getId() + ", startOffset=" + currentAnnot.getStartNode().getOffset() + ", endOffset=" + currentAnnot.getEndNode().getOffset() + ", type=" + currentAnnot.getType() + " was found to violate the" + " crossed over condition. It will be discarded");
        }
    }
    // Here we go.
    if (sListener != null)
        sListener.statusChanged("Dumping annotations as XML");
    // /////////////////////////////////////////
    // Collect every distinct start and end offset of the source annotations
    // in a sorted set. For each offset, all tags belonging to it are written
    // from left to right.
    TreeSet<Long> offsets = new TreeSet<Long>();
    iter = aSourceAnnotationSet.iterator();
    while (iter.hasNext()) {
        Annotation annot = iter.next();
        offsets.add(annot.getStartNode().getOffset());
        offsets.add(annot.getEndNode().getOffset());
    }
    // Consume the offsets from the largest to the smallest; presumably so
    // that inserting text does not shift positions still to be processed.
    while (!offsets.isEmpty()) {
        Long offset = offsets.last();
        // Remove the offset from the set
        offsets.remove(offset);
        // Now, use it.
        // Returns a list with annotations that needs to be serialized in that
        // offset.
        List<Annotation> annotations = getAnnotationsForOffset(aSourceAnnotationSet, offset);
        // Attention: the annotation are serialized from left to right
        StringBuffer tmpBuff = new StringBuffer("");
        // Holds zero-length "isEmptyAndSpan" annotations whose start tag has
        // been written and whose end tag is still pending.
        Stack<Annotation> stack = new Stack<Annotation>();
        // Iterate through all these annotations and serialize them
        Iterator<Annotation> it = annotations.iterator();
        Annotation a = null;
        while (it.hasNext()) {
            a = it.next();
            it.remove();
            // Test if a Ends at offset
            if (offset.equals(a.getEndNode().getOffset())) {
                // Test if a Starts at offset
                if (offset.equals(a.getStartNode().getOffset())) {
                    // Here, the annotation a Starts and Ends at the offset
                    if (null != a.getFeatures().get("isEmptyAndSpan") && "true".equals(a.getFeatures().get("isEmptyAndSpan"))) {
                        // Assert: annotation a with start == end and isEmptyAndSpan
                        tmpBuff.append(writeStartTag(a, includeFeatures, false));
                        stack.push(a);
                    } else {
                        // Assert annotation a with start == end and an empty tag
                        tmpBuff.append(writeEmptyTag(a, false));
                        // The annotation is removed from the caller's source set
                        aSourceAnnotationSet.remove(a);
                    }
                } else {
                    // a only ends here: first close any pending empty-span
                    // annotations, then write a's end tag
                    while (!stack.isEmpty()) {
                        Annotation a1 = stack.pop();
                        tmpBuff.append(writeEndTag(a1));
                    }
                    tmpBuff.append(writeEndTag(a));
                }
            } else {
                // a does not end at the offset; check whether it starts there
                if (offset.equals(a.getStartNode().getOffset())) {
                    // Close any pending empty-span annotations, then write
                    // a's start tag
                    while (!stack.isEmpty()) {
                        Annotation a1 = stack.pop();
                        tmpBuff.append(writeEndTag(a1));
                    }
                    tmpBuff.append(writeStartTag(a, includeFeatures, false));
                    // The annotation is removed from the caller's source set
                    aSourceAnnotationSet.remove(a);
                }
            }
        }
        // Close whatever empty-span annotations are still pending
        while (!stack.isEmpty()) {
            Annotation a1 = stack.pop();
            tmpBuff.append(writeEndTag(a1));
        }
        long originalPosition = -1;
        // Only the LAST annotation handled at this offset decides whether
        // the end-of-annotation lookup is tried first.
        boolean backPositioning = a != null && offset.equals(a.getEndNode().getOffset());
        if (backPositioning) {
            // end of the annotation correction
            originalPosition = repositioning.getOriginalPos(offset.intValue(), true);
        }
        // Fall back to the plain lookup when the first one was not tried or
        // returned -1
        if (originalPosition == -1) {
            originalPosition = repositioning.getOriginalPos(offset.intValue());
        }
        // Insert tmpBuff to the location where it belongs in docContStrBuff
        if (originalPosition != -1 && originalPosition <= originalContentSize) {
            docContStrBuff.insert((int) originalPosition, tmpBuff.toString());
        } else {
            Out.prln("Error in the repositioning. The offset (" + offset.intValue() + ") could not be positioned in the original document. \n" + "Calculated position is: " + originalPosition + " placed back: " + backPositioning);
        }
    }
    // All offsets processed; close the root element if one is recorded
    if (theRootAnnotation != null)
        docContStrBuff.append(writeEndTag(theRootAnnotation));
    return docContStrBuff.toString();
}
Also used : AnnotationSet(gate.AnnotationSet) Annotation(gate.Annotation) Stack(java.util.Stack) AnnotationSetImpl(gate.annotation.AnnotationSetImpl) TreeSet(java.util.TreeSet) StatusListener(gate.event.StatusListener)

Example 10 with StatusListener

use of gate.event.StatusListener in project gate-core by GateNLP.

the class DocumentImpl method toXml.

/**
 * Builds an XML document that aims to preserve the original markup (kept in
 * the same place and format as before the document was processed) and, where
 * possible, adds the annotations given in aSourceAnnotationSet.
 * <b>Warning:</b> annotations from aSourceAnnotationSet are dropped when
 * they would cross over annotations already selected for dumping.
 * <p>
 * When the document has original-content features the work is delegated to
 * {@code saveAnnotationSetAsXmlInOrig}.
 *
 * @param aSourceAnnotationSet
 *          the annotations to combine with the original markup set. If
 *          <code>null</code>, only the original markups are dumped.
 * @param includeFeatures
 *          controls whether the annotation features should be included. If
 *          false, only the annotation type is included in the tag.
 * @return a string representing an XML document containing the original
 *         markup plus the dumped annotations from aSourceAnnotationSet
 */
@Override
@SuppressWarnings("unused")
public String toXml(Set<Annotation> aSourceAnnotationSet, boolean includeFeatures) {
    // Documents that kept their original content take a separate path.
    if (hasOriginalContentFeatures()) {
        return saveAnnotationSetAsXmlInOrig(aSourceAnnotationSet, includeFeatures);
    }

    AnnotationSet originalMarkups = this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
    // Working list of everything that will be written out.
    List<Annotation> toDump = new ArrayList<Annotation>(originalMarkups.size());

    StatusListener statusListener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
    if (statusListener != null) {
        statusListener.statusChanged("Constructing the dumping annotation set.");
    }

    // Start with all the original markup annotations ...
    toDump.addAll(originalMarkups);

    // ... then add each requested annotation that can be inserted safely.
    if (aSourceAnnotationSet != null) {
        for (Annotation candidate : aSourceAnnotationSet) {
            if (insertsSafety(toDump, candidate)) {
                toDump.add(candidate);
                continue;
            }
            // Rejected: only report it in debug mode and when the conflicting
            // annotation is known.
            if (crossedOverAnnotation == null || !DEBUG) {
                continue;
            }
            try {
                Long crossedStart = crossedOverAnnotation.getStartNode().getOffset();
                Long crossedEnd = crossedOverAnnotation.getEndNode().getOffset();
                Long candidateStart = candidate.getStartNode().getOffset();
                Long candidateEnd = candidate.getEndNode().getOffset();
                Out.prln("Warning: Annotations were found to violate the "
                    + "crossed over condition: \n"
                    + "1. [" + getContent().getContent(crossedStart, crossedEnd)
                    + " (" + crossedOverAnnotation.getType() + ": " + crossedStart + ";" + crossedEnd + ")]\n"
                    + "2. [" + getContent().getContent(candidateStart, candidateEnd)
                    + " (" + candidate.getType() + ": " + candidateStart + ";" + candidateEnd
                    + ")]\nThe second one will be discarded.\n");
            } catch (gate.util.InvalidOffsetException ex) {
                throw new GateRuntimeException(ex.getMessage());
            }
        }
    }

    // kalina: order the dumping list by start offset
    Collections.sort(toDump, new gate.util.OffsetComparator());

    if (statusListener != null) {
        statusListener.statusChanged("Dumping annotations as XML");
    }
    StringBuffer xml = new StringBuffer(DocumentXmlUtils.DOC_SIZE_MULTIPLICATION_FACTOR * (this.getContent().size().intValue()));

    // Emit the XML declaration only when the source format was XML.
    String mimeTypeFeature = (String) getFeatures().get("MimeType");
    if (mimeTypeFeature != null && mimeTypeFeature.equalsIgnoreCase("text/xml")) {
        xml.append("<?xml version=\"1.0\" encoding=\"")
            .append(getEncoding())
            .append("\" ?>")
            .append(Strings.getNl());
    }

    // Pull the root annotation out of the list and open it first.
    theRootAnnotation = identifyTheRootAnnotation(toDump);
    if (theRootAnnotation != null) {
        toDump.remove(theRootAnnotation);
        xml.append(writeStartTag(theRootAnnotation, includeFeatures));
    }

    // Body of the document
    xml.append(saveAnnotationSetAsXml(toDump, includeFeatures));

    // Close the root annotation, if there is one
    if (theRootAnnotation != null) {
        xml.append(writeEndTag(theRootAnnotation));
    }

    if (statusListener != null) {
        statusListener.statusChanged("Done.");
    }
    return xml.toString();
}
Also used : ArrayList(java.util.ArrayList) AnnotationSet(gate.AnnotationSet) InvalidOffsetException(gate.util.InvalidOffsetException) Annotation(gate.Annotation) GateRuntimeException(gate.util.GateRuntimeException) StatusListener(gate.event.StatusListener)

Aggregations

StatusListener (gate.event.StatusListener)15 IOException (java.io.IOException)7 DocumentFormatException (gate.util.DocumentFormatException)6 Annotation (gate.Annotation)4 AnnotationSet (gate.AnnotationSet)4 ResourceInstantiationException (gate.creole.ResourceInstantiationException)4 FeatureMap (gate.FeatureMap)3 GateRuntimeException (gate.util.GateRuntimeException)3 InputStream (java.io.InputStream)3 XStream (com.thoughtworks.xstream.XStream)2 StaxDriver (com.thoughtworks.xstream.io.xml.StaxDriver)2 XStream11NameCoder (com.thoughtworks.xstream.io.xml.XStream11NameCoder)2 Document (gate.Document)2 ProgressListener (gate.event.ProgressListener)2 PersistenceException (gate.persist.PersistenceException)2 BomStrippingInputStreamReader (gate.util.BomStrippingInputStreamReader)2 GateException (gate.util.GateException)2 InvalidOffsetException (gate.util.InvalidOffsetException)2 XmlDocumentHandler (gate.xml.XmlDocumentHandler)2 BufferedReader (java.io.BufferedReader)2