Search in sources :

Example 1 with DocumentFormat

Use of gate.DocumentFormat in project gate-core by GateNLP.

The init method of the class DocumentImpl.

/**
 * Initialise this resource, and return it.
 *
 * <p>The document content is built either from {@code stringContent} (when no
 * source URL is set) or by reading {@code sourceUrl} with the configured
 * encoding and offsets. The origin is recorded in the {@code "gate.SourceURL"}
 * feature. If original-content preservation is enabled, the original content
 * is stored as a document feature. If the document is markup aware, a
 * {@link DocumentFormat} is looked up (by explicit MIME type if one was given,
 * otherwise from the source URL) and used to unpack the markup, optionally
 * collecting repositioning information.
 *
 * @return this document
 * @throws ResourceInstantiationException if both the source URL and the string
 *         content are null, if the content cannot be read from the URL, if the
 *         requested MIME type has no registered document format, or if the
 *         markup cannot be unpacked
 */
@Override
public Resource init() throws ResourceInstantiationException {
    // set up the source URL and create the content
    if (sourceUrl == null) {
        if (stringContent == null) {
            throw new ResourceInstantiationException("The sourceURL and document's content were null.");
        }
        content = new DocumentContentImpl(stringContent);
        getFeatures().put("gate.SourceURL", "created from String");
    } else {
        try {
            content = new DocumentContentImpl(sourceUrl, getEncoding(), sourceUrlStartOffset, sourceUrlEndOffset);
            getFeatures().put("gate.SourceURL", sourceUrl.toExternalForm());
        } catch (IOException e) {
            // keep the original exception as the cause so the stack trace is not lost
            throw new ResourceInstantiationException("DocumentImpl.init: " + e, e);
        }
    }
    if (preserveOriginalContent.booleanValue() && content != null) {
        String originalContent = ((DocumentContentImpl) content).getOriginalContent();
        getFeatures().put(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME, originalContent);
    }
    // set up a DocumentFormat if markup unpacking required
    if (getMarkupAware().booleanValue()) {
        DocumentFormat docFormat = null;
        // if a specific MIME type has been given, use it
        if (this.mimeType != null && this.mimeType.length() > 0) {
            MimeType theType = DocumentFormat.getMimeTypeForString(mimeType);
            if (theType == null) {
                throw new ResourceInstantiationException("MIME type \"" + this.mimeType + "\" has no registered DocumentFormat");
            }
            docFormat = DocumentFormat.getDocumentFormat(this, theType);
        } else {
            docFormat = DocumentFormat.getDocumentFormat(this, sourceUrl);
        }
        try {
            if (docFormat != null) {
                StatusListener sListener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
                if (sListener != null) {
                    docFormat.addStatusListener(sListener);
                }
                try {
                    // set the flag if true and if the document format supports collecting
                    docFormat.setShouldCollectRepositioning(collectRepositioningInfo);
                    if (docFormat.getShouldCollectRepositioning().booleanValue()) {
                        // unpack with collection of repositioning information
                        RepositioningInfo info = new RepositioningInfo();
                        String origContent = (String) getFeatures().get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
                        RepositioningInfo ampCodingInfo = new RepositioningInfo();
                        boolean isXmlFormat = docFormat instanceof XmlDocumentFormat;
                        if (origContent != null) {
                            collectInformationForAmpCodding(origContent, ampCodingInfo, isXmlFormat);
                            if (docFormat.getMimeType().equals(new MimeType("text", "html"))) {
                                collectInformationForWS(origContent, ampCodingInfo);
                            }
                        }
                        docFormat.unpackMarkup(this, info, ampCodingInfo);
                        if (origContent != null && isXmlFormat) {
                            // CRLF correction of RepositioningInfo
                            correctRepositioningForCRLFInXML(origContent, info);
                        }
                        getFeatures().put(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME, info);
                    } else {
                        // normal old fashioned unpack
                        docFormat.unpackMarkup(this);
                    }
                } finally {
                    // always detach the listener we attached, even when unpacking
                    // fails, so it does not stay registered on the format object
                    if (sListener != null) {
                        docFormat.removeStatusListener(sListener);
                    }
                }
            }
        } catch (DocumentFormatException e) {
            throw new ResourceInstantiationException("Couldn't unpack markup in document " + (sourceUrl != null ? sourceUrl.toExternalForm() : "") + "!", e);
        }
    }
    return this;
}
Also used : DocumentFormatException(gate.util.DocumentFormatException) DocumentFormat(gate.DocumentFormat) IOException(java.io.IOException) StatusListener(gate.event.StatusListener) ResourceInstantiationException(gate.creole.ResourceInstantiationException)

Aggregations

DocumentFormat (gate.DocumentFormat)1 ResourceInstantiationException (gate.creole.ResourceInstantiationException)1 StatusListener (gate.event.StatusListener)1 DocumentFormatException (gate.util.DocumentFormatException)1 IOException (java.io.IOException)1