Search in sources :

Example 1 with StatusListener

use of gate.event.StatusListener in project gate-core by GateNLP.

the class XmlDocumentFormat method unpackMarkup.

// unpackMarkup
/**
 * Unpacks the markup in the document. This converts markup from the
 * native format (e.g. XML) into annotations in GATE format. Uses the
 * markupElementsMap to determine which elements to convert, and what
 * annotation type names to use. If the document was created from a
 * String, it is advisable to set the doc's sourceUrl to <b>null</b>.
 * If the document has a valid URL, the parser will try to parse the
 * XML document pointed to by the URL. If the URL is not valid, or is
 * null, the doc's content will be parsed. If the doc's content is not
 * valid XML the parser might fail.
 * <p>
 * The first 2048 characters of the content (when there is any) are
 * handed to <code>isGateXmlFormat</code> to choose between the GATE
 * format unpacker and the general XML unpacker.
 *
 * @param doc The GATE document you want to parse. If
 *          <code>doc.getSourceUrl()</code> returns <b>null</b>
 *          then the content of doc will be parsed. Using a URL is
 *          recommended because the parser will report errors correctly
 *          if the XML document is not well formed.
 * @param repInfo repositioning information, forwarded unchanged to the
 *          general XML unpacker
 * @param ampCodingInfo ampersand coding information, forwarded
 *          unchanged to the general XML unpacker
 * @throws DocumentFormatException if <code>doc</code> is null or has
 *           neither a source URL nor content
 */
@Override
public void unpackMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo) throws DocumentFormatException {
    if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
        throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    }
    // Create a status listener that relays handler messages to our own listeners
    StatusListener statusListener = new StatusListener() {

        @Override
        public void statusChanged(String text) {
            // This is implemented in DocumentFormat.java and inherited here
            fireStatusChanged(text);
        }
    };
    // Determine whether we have a GATE format XML document or another kind.
    // The guard above accepts a document that has a source URL but no
    // content, so the content must be null-checked before it is inspected;
    // such a document cannot be sniffed and is treated as general XML.
    // NOTE(review): presumably the general path then reads from the URL - confirm.
    boolean gateFormat = false;
    if (doc.getContent() != null) {
        String content = doc.getContent().toString();
        // only the head of the document is needed for format detection
        if (content.length() > 2048) {
            content = content.substring(0, 2048);
        }
        gateFormat = isGateXmlFormat(content);
    }
    if (gateFormat) {
        unpackGateFormatMarkup(doc, statusListener);
    } else {
        unpackGeneralXmlMarkup(doc, repInfo, ampCodingInfo, statusListener);
    }
}
Also used : DocumentFormatException(gate.util.DocumentFormatException) StatusListener(gate.event.StatusListener)

Example 2 with StatusListener

use of gate.event.StatusListener in project gate-core by GateNLP.

the class PersistenceManager method getPersistentRepresentation.

/**
 * Produces the persistent equivalent of the supplied object.
 * <p>
 * A replacement already recorded for the same target in the current
 * replacements map is returned as is. Otherwise the most specific
 * persistent type registered for the target's class is instantiated,
 * populated from the target via <code>extractDataFromSource</code>,
 * recorded and returned. When no persistent type is registered the
 * target itself is returned, provided it is {@link Serializable}.
 *
 * @param target the object to be analysed and translated into a
 *          persistent equivalent; may be <code>null</code>
 * @return the persistent equivalent value for the provided target, or
 *         <code>null</code> when the target is <code>null</code>
 * @throws PersistenceException if no persistent type exists and the
 *           target is not serialisable, or if the persistent type
 *           cannot be instantiated
 */
public static Serializable getPersistentRepresentation(Object target) throws PersistenceException {
    if (target == null) {
        return null;
    }
    // reuse the replacement if this object has been handled before
    Persistence replacement = existingPersistentReplacements.get().getFirst().get(new ObjectHolder(target));
    if (replacement != null) {
        return replacement;
    }
    Class<?> targetType = target.getClass();
    Class<?> persistentType = getMostSpecificPersistentType(targetType);
    if (persistentType == null) {
        // no special handler: the object must be able to stand for itself
        if (!(target instanceof Serializable)) {
            throw new PersistenceException("Could not find a serialisable replacement for " + targetType);
        }
        return (Serializable) target;
    }
    // a persistent type exists: instantiate it, populate it and return it
    try {
        replacement = (Persistence) persistentType.newInstance();
    } catch (Exception e) {
        throw new PersistenceException(e);
    }
    if (target instanceof NameBearer) {
        // report progress for named objects when a global status listener is registered
        StatusListener listener = (StatusListener) Gate.getListeners().get("gate.event.StatusListener");
        if (listener != null) {
            listener.statusChanged("Storing " + ((NameBearer) target).getName());
        }
    }
    replacement.extractDataFromSource(target);
    // the map is looked up again rather than cached across extractDataFromSource
    existingPersistentReplacements.get().getFirst().put(new ObjectHolder(target), replacement);
    return replacement;
}
Also used : Serializable(java.io.Serializable) PersistenceException(gate.persist.PersistenceException) StatusListener(gate.event.StatusListener) NameBearer(gate.util.NameBearer) URISyntaxException(java.net.URISyntaxException) XMLStreamException(javax.xml.stream.XMLStreamException) PersistenceException(gate.persist.PersistenceException) GateRuntimeException(gate.util.GateRuntimeException) ResourceInstantiationException(gate.creole.ResourceInstantiationException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) GateException(gate.util.GateException)

Example 3 with StatusListener

use of gate.event.StatusListener in project gate-core by GateNLP.

the class TikaFormat method unpackMarkup.

/**
 * Unpacks the markup of a document by running the parser obtained from
 * a fresh {@link TikaConfig} over the stream opened from the document's
 * source URL, with an {@link XmlDocumentHandler} receiving the parse
 * events. Afterwards the collected metadata is passed to
 * <code>setDocumentFeatures</code> and, for {@link DocumentImpl}
 * instances, the next annotation id is taken from the handler.
 *
 * @param doc the GATE document to parse; must have a source URL
 * @param repInfo repositioning information handed to the handler
 * @param ampCodingInfo ampersand coding positions handed to the handler
 * @throws DocumentFormatException if <code>doc</code> or its source URL
 *           is null, or if reading or parsing the source fails
 */
@Override
public void unpackMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo) throws DocumentFormatException {
    if (doc == null || doc.getSourceUrl() == null) {
        throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    }
    // Relay status messages from the handler to this format's own listeners
    StatusListener relay = new StatusListener() {

        @Override
        public void statusChanged(String text) {
            // This is implemented in DocumentFormat.java and inherited here
            fireStatusChanged(text);
        }
    };
    XmlDocumentHandler handler = new XmlDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);
    Metadata metadata = extractParserTips(doc);
    handler.addStatusListener(relay);
    handler.setRepositioningInfo(repInfo);
    // set the object with ampersand coding positions
    handler.setAmpCodingInfo(ampCodingInfo);
    InputStream source = null;
    try {
        Parser tikaParser = new TikaConfig().getParser();
        source = doc.getSourceUrl().openStream();
        tikaParser.parse(source, handler, metadata, new ParseContext());
        setDocumentFeatures(metadata, doc);
    } catch (IOException | SAXException | TikaException e) {
        // every failure mode is reported to the caller in the same way
        throw new DocumentFormatException(e);
    } finally {
        // closeQuietly is null safe, so this is fine even if openStream failed
        IOUtils.closeQuietly(source);
        handler.removeStatusListener(relay);
    }
    if (doc instanceof DocumentImpl) {
        ((DocumentImpl) doc).setNextAnnotationId(handler.getCustomObjectsId());
    }
}
Also used : TikaException(org.apache.tika.exception.TikaException) TikaConfig(org.apache.tika.config.TikaConfig) InputStream(java.io.InputStream) XmlDocumentHandler(gate.xml.XmlDocumentHandler) Metadata(org.apache.tika.metadata.Metadata) IOException(java.io.IOException) Parser(org.apache.tika.parser.Parser) SAXException(org.xml.sax.SAXException) DocumentFormatException(gate.util.DocumentFormatException) ParseContext(org.apache.tika.parser.ParseContext) StatusListener(gate.event.StatusListener)

Example 4 with StatusListener

use of gate.event.StatusListener in project gate-core by GateNLP.

the class DocumentStaxUtils method writeDocument.

/**
 * Writes the specified GATE Document to an XMLStreamWriter. Only the
 * GateDocument element is written - the XML declaration must be
 * supplied by the caller if required.
 * <p>
 * Output order: document features, text with nodes, the default
 * annotation set (if present), every named annotation set, then one
 * relation set per key of <code>annotationSets</code>.
 *
 * @param doc the Document to write
 * @param annotationSets the annotations to include. If the map
 *          contains an entry for the key <code>null</code>, this
 *          will be treated as the default set. All other entries are
 *          treated as named annotation sets.
 * @param xsw the StAX XMLStreamWriter to use for output
 * @param namespaceURI the namespace used for the written elements; it
 *          is declared as the default namespace on the root element
 *          when it is not the empty string
 * @throws XMLStreamException if an error occurs during writing
 */
public static void writeDocument(Document doc, Map<String, Collection<Annotation>> annotationSets, XMLStreamWriter xsw, String namespaceURI) throws XMLStreamException {
    xsw.setDefaultNamespace(namespaceURI);
    xsw.writeStartElement(namespaceURI, "GateDocument");
    xsw.writeAttribute("version", GATE_XML_VERSION);
    if (namespaceURI.length() > 0) {
        xsw.writeDefaultNamespace(namespaceURI);
    }
    newLine(xsw);

    // document features
    xsw.writeComment(" The document's features");
    newLine(xsw);
    newLine(xsw);
    xsw.writeStartElement(namespaceURI, "GateDocumentFeatures");
    newLine(xsw);
    writeFeatures(doc.getFeatures(), xsw, namespaceURI);
    // closes GateDocumentFeatures
    xsw.writeEndElement();
    newLine(xsw);

    // text with nodes
    xsw.writeComment(" The document content area with serialized nodes ");
    newLine(xsw);
    newLine(xsw);
    writeTextWithNodes(doc, annotationSets.values(), xsw, namespaceURI);
    newLine(xsw);

    // optional global listener used to report saving progress
    StatusListener progress = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");

    // the default annotation set is stored under the null key and goes first
    if (annotationSets.containsKey(null)) {
        if (progress != null) {
            progress.statusChanged("Saving the default annotation set ");
        }
        xsw.writeComment(" The default annotation set ");
        newLine(xsw);
        newLine(xsw);
        writeAnnotationSet(annotationSets.get(null), null, xsw, namespaceURI);
        newLine(xsw);
    }

    // then every named annotation set
    for (Map.Entry<String, Collection<Annotation>> namedSet : annotationSets.entrySet()) {
        String setName = namedSet.getKey();
        if (setName == null) {
            // the default set has already been written above
            continue;
        }
        Collection<Annotation> setAnnotations = namedSet.getValue();
        xsw.writeComment(" Named annotation set ");
        newLine(xsw);
        newLine(xsw);
        if (progress != null) {
            progress.statusChanged("Saving " + setName + " annotation set ");
        }
        writeAnnotationSet(setAnnotations, setName, xsw, namespaceURI);
        newLine(xsw);
    }

    // relation sets are looked up on the document itself, one per key (the null key included)
    for (String setName : annotationSets.keySet()) {
        writeRelationSet(doc.getAnnotations(setName).getRelations(), xsw, namespaceURI);
    }

    // close the GateDocument element
    xsw.writeEndElement();
    newLine(xsw);
}
Also used : Collection(java.util.Collection) StatusListener(gate.event.StatusListener) HashMap(java.util.HashMap) Map(java.util.Map) FeatureMap(gate.FeatureMap) Annotation(gate.Annotation)

Example 5 with StatusListener

use of gate.event.StatusListener in project gate-core by GateNLP.

the class NekoHtmlDocumentFormat method unpackMarkup.

/**
 * Unpacks the markup in the document. This converts markup from the
 * native (HTML) format into annotations in GATE format using the
 * NekoHTML parser.
 * <p>
 * The input is chosen as follows: if
 * <code>hasContentButNoValidUrl(doc)</code> is true the document
 * content is parsed from a String; otherwise, for a
 * {@link TextualDocument}, the source URL is read using the document's
 * own encoding (gzip content encoding is unwrapped) and the parser is
 * told to ignore charset hints in the page; otherwise the parser is
 * given the URL and decides the encoding itself.
 * <p>
 * Failures other than I/O errors mark the document with the
 * <code>parsingError</code> feature. They are rethrown only when the
 * document feature <code>GateConstants.THROWEX_FORMAT_PROPERTY_NAME</code>
 * is <code>Boolean.TRUE</code>; otherwise a warning and the stack trace
 * are printed and the method returns normally.
 *
 * @param doc The GATE document you want to parse. Using a URL is
 *          recommended because the parser will report errors correctly
 *          if the document is not well formed.
 * @param repInfo repositioning information handed to the handler
 * @param ampCodingInfo ampersand coding positions handed to the handler
 * @throws DocumentFormatException if <code>doc</code> is null or has
 *           neither a source URL nor content, on I/O failure, or on a
 *           parse failure when the document asks for exceptions to be
 *           thrown
 */
@Override
public void unpackMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo) throws DocumentFormatException {
    if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
        throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    }
    // Create a status listener that relays handler messages to our own listeners
    StatusListener statusListener = new StatusListener() {

        @Override
        public void statusChanged(String text) {
            // This is implemented in DocumentFormat.java and inherited here
            fireStatusChanged(text);
        }
    };
    boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
    NekoHtmlDocumentHandler handler = null;
    // Stream opened by this method on the source URL (if any); it is
    // closed in the finally block so it cannot leak when parsing aborts.
    InputStream urlStream = null;
    try {
        org.cyberneko.html.HTMLConfiguration parser = new HTMLConfiguration();
        // convert element and attribute names to lower case
        parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
        parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
        // make parser augment infoset with location information
        parser.setFeature(NekoHtmlDocumentHandler.AUGMENTATIONS, true);
        // Create a new HTML document handler
        handler = new NekoHtmlDocumentHandler(doc, null, ignorableTags);
        // Register a status listener with it
        handler.addStatusListener(statusListener);
        // set repositioning object
        handler.setRepositioningInfo(repInfo);
        // set the object with ampersand coding positions
        handler.setAmpCodingInfo(ampCodingInfo);
        // construct the list of offsets for each line of the document
        int[] lineOffsets = buildLineOffsets(doc.getContent().toString());
        handler.setLineOffsets(lineOffsets);
        // set the handlers
        parser.setDocumentHandler(handler);
        parser.setErrorHandler(handler);
        // Parse the document with the appropriate encoding
        XMLInputSource is;
        if (docHasContentButNoValidURL) {
            // no URL, so parse from string
            is = new XMLInputSource(null, null, null, new StringReader(doc.getContent().toString()), null);
        } else if (doc instanceof TextualDocument) {
            // textual document - load with user specified encoding
            String docEncoding = ((TextualDocument) doc).getEncoding();
            // no BOM stripping is applied to this stream
            URLConnection conn = doc.getSourceUrl().openConnection();
            urlStream = conn.getInputStream();
            if ("gzip".equals(conn.getContentEncoding())) {
                // keep tracking the outermost stream so closing it closes everything
                urlStream = new GZIPInputStream(urlStream);
            }
            Reader docReader = new InputStreamReader(urlStream, docEncoding);
            is = new XMLInputSource(null, doc.getSourceUrl().toString(), doc.getSourceUrl().toString(), docReader, docEncoding);
            // since we control the encoding, tell the parser to ignore any
            // meta http-equiv hints
            parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true);
        } else {
            // let the parser decide the encoding
            is = new XMLInputSource(null, doc.getSourceUrl().toString(), doc.getSourceUrl().toString());
        }
        /* The following line can forward an
         * ArrayIndexOutOfBoundsException from
         * org.cyberneko.html.HTMLConfiguration.parse; it is handled by
         * the generic catch block below. */
        parser.parse(is);
        // Only DocumentImpl exposes setNextAnnotationId. An unconditional cast
        // would throw ClassCastException for other implementations and the
        // generic catch below would then wrongly flag a successful parse as
        // a parsing error.
        if (doc instanceof DocumentImpl) {
            ((DocumentImpl) doc).setNextAnnotationId(handler.getCustomObjectsId());
        }
    } catch (IOException e) {
        /* Handle IOException specially. The URL is concatenated directly
         * (not via toString()) so a null source URL cannot raise a
         * NullPointerException that would mask the I/O failure. */
        throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), e);
    } catch (Exception e) {
        /* Handle XNIException and ArrayIndexOutOfBoundsException:
         * flag the parsing error and keep going. */
        doc.getFeatures().put("parsingError", Boolean.TRUE);
        Boolean bThrow = (Boolean) doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
        if (bThrow != null && bThrow.booleanValue()) {
            // the document asked for format errors to be propagated
            throw new DocumentFormatException(e);
        } else {
            Out.println("Warning: Document remains unparsed. \n" + "\n  Stack Dump: ");
            e.printStackTrace(Out.getPrintWriter());
        }
    } finally {
        if (urlStream != null) {
            try {
                urlStream.close();
            } catch (IOException ignored) {
                // best effort: a failure to close must not hide the parse outcome
            }
        }
        if (handler != null) {
            handler.removeStatusListener(statusListener);
        }
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) XMLInputSource(org.apache.xerces.xni.parser.XMLInputSource) GZIPInputStream(java.util.zip.GZIPInputStream) InputStream(java.io.InputStream) HTMLConfiguration(org.cyberneko.html.HTMLConfiguration) Reader(java.io.Reader) InputStreamReader(java.io.InputStreamReader) StringReader(java.io.StringReader) HTMLConfiguration(org.cyberneko.html.HTMLConfiguration) IOException(java.io.IOException) URLConnection(java.net.URLConnection) ResourceInstantiationException(gate.creole.ResourceInstantiationException) IOException(java.io.IOException) DocumentFormatException(gate.util.DocumentFormatException) DocumentFormatException(gate.util.DocumentFormatException) NekoHtmlDocumentHandler(gate.html.NekoHtmlDocumentHandler) GZIPInputStream(java.util.zip.GZIPInputStream) TextualDocument(gate.TextualDocument) StringReader(java.io.StringReader) StatusListener(gate.event.StatusListener)

Aggregations

StatusListener (gate.event.StatusListener)15 IOException (java.io.IOException)7 DocumentFormatException (gate.util.DocumentFormatException)6 Annotation (gate.Annotation)4 AnnotationSet (gate.AnnotationSet)4 ResourceInstantiationException (gate.creole.ResourceInstantiationException)4 FeatureMap (gate.FeatureMap)3 GateRuntimeException (gate.util.GateRuntimeException)3 InputStream (java.io.InputStream)3 XStream (com.thoughtworks.xstream.XStream)2 StaxDriver (com.thoughtworks.xstream.io.xml.StaxDriver)2 XStream11NameCoder (com.thoughtworks.xstream.io.xml.XStream11NameCoder)2 Document (gate.Document)2 ProgressListener (gate.event.ProgressListener)2 PersistenceException (gate.persist.PersistenceException)2 BomStrippingInputStreamReader (gate.util.BomStrippingInputStreamReader)2 GateException (gate.util.GateException)2 InvalidOffsetException (gate.util.InvalidOffsetException)2 XmlDocumentHandler (gate.xml.XmlDocumentHandler)2 BufferedReader (java.io.BufferedReader)2