Search in sources :

Example 6 with DocumentFormatException

use of gate.util.DocumentFormatException in project gate-core by GateNLP.

the class Annotandum method unpackMarkup.

/**
 * Unpacks column-formatted markup. Every input line carries a token in its
 * first whitespace-separated column and tags in the remaining columns. The
 * document content is replaced by the tokens, each followed by one space,
 * and the collected annotations are added to the Original markups set.
 * <p>
 * Tag columns are read as follows: {@code O} closes every annotation in
 * progress; {@code U-FOO} is a single-token FOO; {@code B-FOO} opens a FOO;
 * {@code I-FOO} extends the open FOO; {@code L-FOO} extends the open FOO and
 * closes it; any other text is a single-token annotation whose type is the
 * column text itself (such as a POS tag).
 * <p>
 * Known weak spot of the I-FOO handling:
 * <pre>
 * this    B-Foo
 * is      B-Bar
 * strange I-Foo
 * </pre>
 * produces a Foo spanning "this is strange", because the I-Foo extends the
 * Foo that is still open. If the sentence is cut off before another I-Foo is
 * reached, the Foo is not extended. Carefully edited input does not contain
 * this situation.
 *
 * @param doc the GATE document whose content is to be unpacked
 * @throws DocumentFormatException if the document is null, if both its
 *         source URL and its content are null, or if an offset is rejected
 *         while replacing the content or adding annotations
 */
@Override
public void unpackMarkup(gate.Document doc) throws DocumentFormatException {
    boolean nothingToParse = (doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null);
    if (nothingToParse) {
        throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    }
    setNewLineProperty(doc);
    String[] rows = doc.getContent().toString().split("[\\n\\r]+");
    // Text of the rebuilt document
    StringBuilder rebuilt = new StringBuilder();
    // Items of data to be turned into Original markups annotations
    List<Annotandum> pending = new ArrayList<Annotandum>();
    // Currently open tags, keyed by type: created by "B-FOO", extended by
    // "I-FOO", closed by "L-FOO", "O" or end of input.
    Map<String, Annotandum> open = new HashMap<String, Annotandum>();
    // End offset of the most recent token (or newline); survives the loop
    long end = 0L;
    for (String row : rows) {
        // Where the previous row ended: annotations closed by this row stop there
        final long previousEnd = end;
        final long start = rebuilt.length();
        String[] columns = row.split("\\s+");
        if (columns.length == 0) {
            // No columns at all (e.g. a whitespace-only line): emit a newline
            // and close any annotations in progress
            rebuilt.append("\n");
            end = rebuilt.length();
            finishAllTags(open, pending, previousEnd);
            continue;
        }
        String token = columns[0];
        // We've agreed to put the space after every token.
        rebuilt.append(token);
        end = rebuilt.length();
        rebuilt.append(' ');
        // Create Token and following SpaceToken annotation.
        pending.add(Annotandum.makeToken(start, end, token));
        pending.add(Annotandum.makeSpaceToken(end));
        for (int column = 1; column < columns.length; column++) {
            String label = columns[column];
            // A prefixed tag needs at least one character after "X-"
            boolean longEnough = label.length() > 2;
            if (label.equals("O")) {
                // "O": close all annotations in progress
                finishAllTags(open, pending, previousEnd);
            } else if (longEnough && label.startsWith("U-")) {
                // "U-FOO": single-token annotation, after closing any "FOO"
                // already in progress
                String type = label.substring(2);
                finishTag(type, open, pending, previousEnd);
                pending.add(new Annotandum(type, start, end, column, true));
            } else if (longEnough && label.startsWith("L-")) {
                // "L-FOO": last token of "FOO"
                String type = label.substring(2);
                if (!open.containsKey(type)) {
                    // bad data, L-FOO without a B-FOO, so treat as if B-FOO
                    open.put(type, new Annotandum(type, start, end, column, true));
                } else {
                    // good L-FOO, so update the end offset
                    open.get(type).endOffset = end;
                }
                finishTag(type, open, pending, end);
            } else if (longEnough && label.startsWith("B-")) {
                // "B-FOO": start a new "FOO" after closing any "FOO" already
                // in progress
                String type = label.substring(2);
                finishTag(type, open, pending, previousEnd);
                open.put(type, new Annotandum(type, start, end, column, true));
            } else if (longEnough && label.startsWith("I-")) {
                // "I-FOO": extend current "FOO" annotation
                String type = label.substring(2);
                if (!open.containsKey(type)) {
                    // bad data, I-FOO without a B-FOO, so treat as if B-FOO
                    open.put(type, new Annotandum(type, start, end, column, true));
                } else {
                    // good I-FOO, so update the end offset
                    open.get(type).endOffset = end;
                }
            } else {
                // "FOO": treat as single-token annotation (such as POS tag)
                pending.add(new Annotandum(label, start, end, column, false));
            }
        }
    }
    // end of input: close any remaining annotations
    finishAllTags(open, pending, end);
    // set new content & create Original markups annotations
    try {
        DocumentContent replacement = new DocumentContentImpl(rebuilt.toString());
        doc.edit(0L, doc.getContent().size(), replacement);
        long newSize = doc.getContent().size();
        AnnotationSet originalMarkups = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
        for (Annotandum item : pending) {
            if (DEBUG) {
                // Clip the end offset to the new content size for display only
                String covered = Utils.stringFor(doc, item.startOffset, (item.endOffset <= newSize) ? item.endOffset : newSize);
                System.out.format("%d  %d  %s  %s\n", item.startOffset, item.endOffset, item.type, covered);
            }
            originalMarkups.add(item.startOffset, item.endOffset, item.type, item.features);
        }
    } catch (InvalidOffsetException e) {
        throw new DocumentFormatException(e);
    }
}
Also used : InvalidOffsetException(gate.util.InvalidOffsetException) DocumentFormatException(gate.util.DocumentFormatException)

Example 7 with DocumentFormatException

use of gate.util.DocumentFormatException in project gate-core by GateNLP.

the class DocumentImpl method init.

/**
 * Initialise this resource, and return it.
 * <p>
 * The content is created either from {@code stringContent} (when no source
 * URL is set) or by reading the source URL. If original content is to be
 * preserved it is stored as a document feature. If the document is markup
 * aware, a {@link DocumentFormat} is looked up (by the explicit MIME type if
 * one was given, otherwise from the source URL) and asked to unpack the
 * markup, optionally collecting repositioning information.
 *
 * @return this document
 * @throws ResourceInstantiationException if both the source URL and the
 *         string content are null, if the content cannot be read from the
 *         URL, if the given MIME type has no registered format, or if the
 *         markup cannot be unpacked
 */
@Override
public Resource init() throws ResourceInstantiationException {
    // set up the source URL and create the content
    if (sourceUrl == null) {
        if (stringContent == null) {
            throw new ResourceInstantiationException("The sourceURL and document's content were null.");
        }
        content = new DocumentContentImpl(stringContent);
        getFeatures().put("gate.SourceURL", "created from String");
    } else {
        try {
            content = new DocumentContentImpl(sourceUrl, getEncoding(), sourceUrlStartOffset, sourceUrlEndOffset);
            getFeatures().put("gate.SourceURL", sourceUrl.toExternalForm());
        } catch (IOException e) {
            // Same message as before, but chain the cause so the stack trace
            // of the underlying I/O failure is not lost.
            throw new ResourceInstantiationException("DocumentImpl.init: " + e, e);
        }
    }
    if (preserveOriginalContent.booleanValue() && content != null) {
        String originalContent = ((DocumentContentImpl) content).getOriginalContent();
        getFeatures().put(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME, originalContent);
    }
    // set up a DocumentFormat if markup unpacking required
    if (getMarkupAware().booleanValue()) {
        DocumentFormat docFormat = null;
        // if a specific MIME type has been given, use it
        if (this.mimeType != null && this.mimeType.length() > 0) {
            MimeType theType = DocumentFormat.getMimeTypeForString(mimeType);
            if (theType == null) {
                throw new ResourceInstantiationException("MIME type \"" + this.mimeType + "\" has no registered DocumentFormat");
            }
            docFormat = DocumentFormat.getDocumentFormat(this, theType);
        } else {
            docFormat = DocumentFormat.getDocumentFormat(this, sourceUrl);
        }
        try {
            if (docFormat != null) {
                StatusListener sListener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
                if (sListener != null) {
                    docFormat.addStatusListener(sListener);
                }
                try {
                    // set the flag if true and if the document format support collecting
                    docFormat.setShouldCollectRepositioning(collectRepositioningInfo);
                    if (docFormat.getShouldCollectRepositioning().booleanValue()) {
                        // unpack with collecting of repositioning information
                        RepositioningInfo info = new RepositioningInfo();
                        String origContent = (String) getFeatures().get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
                        RepositioningInfo ampCodingInfo = new RepositioningInfo();
                        if (origContent != null) {
                            boolean shouldCorrectCR = docFormat instanceof XmlDocumentFormat;
                            collectInformationForAmpCodding(origContent, ampCodingInfo, shouldCorrectCR);
                            if (docFormat.getMimeType().equals(new MimeType("text", "html"))) {
                                collectInformationForWS(origContent, ampCodingInfo);
                            }
                        }
                        docFormat.unpackMarkup(this, info, ampCodingInfo);
                        if (origContent != null && docFormat instanceof XmlDocumentFormat) {
                            // CRLF correction of RepositioningInfo
                            correctRepositioningForCRLFInXML(origContent, info);
                        }
                        getFeatures().put(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME, info);
                    } else {
                        // normal old fashioned unpack
                        docFormat.unpackMarkup(this);
                    }
                } finally {
                    // Always unregister, even when unpacking fails, so the
                    // listener does not stay attached to the format object.
                    if (sListener != null) {
                        docFormat.removeStatusListener(sListener);
                    }
                }
            }
        } catch (DocumentFormatException e) {
            throw new ResourceInstantiationException("Couldn't unpack markup in document " + (sourceUrl != null ? sourceUrl.toExternalForm() : "") + "!", e);
        }
    }
    return this;
}
Also used : DocumentFormatException(gate.util.DocumentFormatException) DocumentFormat(gate.DocumentFormat) IOException(java.io.IOException) StatusListener(gate.event.StatusListener) ResourceInstantiationException(gate.creole.ResourceInstantiationException)

Example 8 with DocumentFormatException

use of gate.util.DocumentFormatException in project gate-core by GateNLP.

the class EmailDocumentFormat method unpackMarkup.

/**
 * Unpacks the markup in the document, converting markup from the native
 * format (e.g. EMAIL) into annotations in GATE format.
 * Uses the markupElementsMap to determine which elements to convert, and
 * what annotation type names to use.
 * It always tries to parse the document's content, whether or not the
 * sourceUrl is null.
 *
 * @param doc The GATE document you want to parse.
 * @throws DocumentFormatException if the document is null, if both its
 *         source URL and its content are null, or if annotating the messages
 *         or their paragraphs fails
 */
@Override
public void unpackMarkup(gate.Document doc) throws DocumentFormatException {
    boolean nothingToParse = (doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null);
    if (nothingToParse) {
        throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    }
    setNewLineProperty(doc);
    // The handler that creates the annotations on the document
    final EmailDocumentHandler handler = new gate.email.EmailDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);
    // Relay the handler's status messages to the listeners of this format
    final StatusListener relay = new StatusListener() {

        @Override
        public void statusChanged(String text) {
            // fireStatusChanged is implemented in DocumentFormat.java and inherited here
            fireStatusChanged(text);
        }
    };
    handler.addStatusListener(relay);
    try {
        // Create the annotations on the GATE document
        handler.annotateMessages();
        // Look for paragraphs inside every "body" annotation
        AnnotationSet bodies = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get("body");
        boolean hasBodies = bodies != null && !bodies.isEmpty();
        if (hasBodies) {
            for (Annotation body : bodies) {
                int bodyStart = body.getStartNode().getOffset().intValue();
                int bodyEnd = body.getEndNode().getOffset().intValue();
                annotateParagraphs(doc, bodyStart, bodyEnd, GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
            }
        }
    } catch (IOException e) {
        throw new DocumentFormatException("Couldn't create a buffered reader ", e);
    } catch (InvalidOffsetException e) {
        throw new DocumentFormatException(e);
    } finally {
        // Unregister the relay whether or not annotation succeeded
        handler.removeStatusListener(relay);
    }
}
Also used : DocumentFormatException(gate.util.DocumentFormatException) EmailDocumentHandler(gate.email.EmailDocumentHandler) EmailDocumentHandler(gate.email.EmailDocumentHandler) AnnotationSet(gate.AnnotationSet) InvalidOffsetException(gate.util.InvalidOffsetException) StatusListener(gate.event.StatusListener) IOException(java.io.IOException) Annotation(gate.Annotation)

Example 9 with DocumentFormatException

use of gate.util.DocumentFormatException in project gate-core by GateNLP.

the class TextualDocumentFormat method annotateParagraphs.

// removeExtraNewLine(Document doc)
/**
 * This method annotates paragraphs in a GATE document. The investigated text
 * spans between the start and end offsets, and the paragraph annotations are
 * created in the set named annotSetName. If annotSetName is null they are
 * created in the default annotation set.
 * <p>
 * Detection is a small three-state automaton driven by newline ('\n')
 * characters: a paragraph starts at the first non-whitespace character and
 * is closed when two newlines are read in immediate succession ('\r'
 * characters directly following a newline are skipped). Text still open
 * when endOffset is reached becomes a final paragraph ending at endOffset.
 *
 * @param aDoc the GATE document on which paragraph detection is performed.
 *  If it is null, or its content is null, the method returns without doing
 *  anything.
 * @param startOffset the index in the document content at which paragraph
 *  detection starts. If it is greater than endOffset the method returns
 *  without doing anything.
 * @param endOffset the offset at which detection ends (exclusive).
 * @param annotSetName the name of the set in which the paragraph
 *  annotations are created. The annotation type created is "paragraph".
 * @throws DocumentFormatException if a paragraph annotation cannot be
 *  created because its offsets are rejected.
 */
public void annotateParagraphs(Document aDoc, int startOffset, int endOffset, String annotSetName) throws DocumentFormatException {
    // Simply return if the document is null or its content
    if (aDoc == null || aDoc.getContent() == null)
        return;
    // Simply return if the start is > than the end
    if (startOffset > endOffset)
        return;
    // Decide where to put the newly detected annotations
    AnnotationSet annotSet = (annotSetName == null) ? aDoc.getAnnotations() : aDoc.getAnnotations(annotSetName);
    // Extract the document content
    String content = aDoc.getContent().toString();
    // This is the offset marking the start of a para
    int startOffsetPara = startOffset;
    // This marks the end of a para
    int endOffsetPara = endOffset;
    // The initial state of the automaton:
    //   1 = skipping whitespace before a paragraph
    //   2 = inside a paragraph
    //   3 = inside a paragraph, the previous character was a line break
    int state = 1;
    int index = startOffset;
    while (index < endOffset) {
        // Read the current char
        char ch = content.charAt(index);
        // Whether THIS character is a line break. The flag is deliberately
        // per-character: a flag that outlives state 1 (where newlines are
        // mere whitespace) would be consumed one character into the next
        // paragraph and could end that paragraph after a single newline.
        boolean readBR = (ch == '\n');
        if (readBR) {
            // Swallow any '\r' characters that directly follow the newline
            while ((index + 1 < endOffset) && (content.charAt(index + 1) == '\r')) index++;
        }
        switch(state) {
            // Stay in state 1 while it reads whitespace
            case 1:
                // A non-whitespace char marks the beginning of a paragraph
                if (!Character.isWhitespace(ch)) {
                    state = 2;
                    startOffsetPara = index;
                }
                break;
            // Stay in state 2 while reading chars that are not line breaks.
            // It can also be a final state.
            case 2:
                if (readBR) {
                    // A line break may end the paragraph; state 3 confirms
                    // it, so the candidate end is recorded here.
                    endOffsetPara = index;
                    state = 3;
                }
                break;
            // A second line break confirms the end of the paragraph;
            // anything else means the paragraph continues.
            case 3:
                if (readBR) {
                    state = 1;
                    // Create an annotation type paragraph
                    try {
                        annotSet.add(Long.valueOf(startOffsetPara), Long.valueOf(endOffsetPara), "paragraph", Factory.newFeatureMap());
                    } catch (gate.util.InvalidOffsetException ioe) {
                        throw new DocumentFormatException("Coudn't create a paragraph" + " annotation", ioe);
                    }
                } else {
                    // Go to state 2 and keep reading chars
                    state = 2;
                }
                break;
        }
        // Prepare to read the next char.
        index++;
    }
    endOffsetPara = index;
    // Investigate where the automaton has stopped: in state 2 or 3 a
    // paragraph is still open and is closed at the end of the range
    if (state == 2 || state == 3) {
        try {
            annotSet.add(Long.valueOf(startOffsetPara), Long.valueOf(endOffsetPara), "paragraph", Factory.newFeatureMap());
        } catch (gate.util.InvalidOffsetException ioe) {
            throw new DocumentFormatException("Coudn't create a paragraph" + " annotation", ioe);
        }
    }
}
Also used : DocumentFormatException(gate.util.DocumentFormatException) gate(gate)

Example 10 with DocumentFormatException

use of gate.util.DocumentFormatException in project gate-core by GateNLP.

the class UimaDocumentFormat method unpackCasMarkup.

/**
 * Convert UIMA CAS markups to GATE markups.
 * <p>
 * Works on the "Original markups" annotation set of an already parsed XCAS
 * or XMI CAS document: the document content is replaced by the SOFA
 * string(s), CAS bookkeeping annotations are removed, and every remaining
 * UIMA annotation is re-created either as a GATE annotation (when it has
 * "begin" and "end" features) or as a set of document features (when it has
 * not). Annotations listed as members of a view are additionally added to a
 * "CasView&lt;n&gt;" annotation set.
 *
 * @param doc XML document already parsed
 * @throws DocumentFormatException if the document is neither XCAS nor XMI
 *         CAS, if a "begin"/"end" value is not a number, or if an offset is
 *         rejected while reading content or creating an annotation
 */
private void unpackCasMarkup(Document doc) throws DocumentFormatException {
    // Input and output are the same annotation set
    AnnotationSet inputAS = doc.getAnnotations("Original markups");
    AnnotationSet outputAS = doc.getAnnotations("Original markups");
    // set format specific names
    String casPrefix;
    String idName;
    if (!inputAS.get("CAS").isEmpty()) {
        casPrefix = "uima.cas.";
        idName = "_id";
    } else if (!inputAS.get("xmi:XMI").isEmpty()) {
        casPrefix = "cas:";
        idName = "xmi:id";
    } else {
        throw new DocumentFormatException("The document \"" + doc.getName() + "\" is neither of XCAS nor XMICAS format.");
    }
    // get array/list contained elements annotations.
    // The prefix is quoted because it contains regex metacharacters ('.'),
    // and the pattern is compiled once instead of once per annotation.
    java.util.regex.Pattern containerType = java.util.regex.Pattern.compile(java.util.regex.Pattern.quote(casPrefix) + "[a-zA-Z]+(List|Array)");
    for (Annotation annotation : inputAS) {
        if (containerType.matcher(annotation.getType()).matches()) {
            try {
                String elements = doc.getContent().getContent(annotation.getStartNode().getOffset(), annotation.getEndNode().getOffset()).toString();
                // add contained values as a feature to the array annotation
                if (!elements.trim().equals("")) {
                    annotation.getFeatures().put("elements", elements);
                }
            } catch (InvalidOffsetException e) {
                throw new DocumentFormatException(e);
            }
        }
    }
    // get document content from SOFA annotations
    Set<Annotation> sofaSet = inputAS.get(casPrefix + "Sofa");
    if (sofaSet.size() > 1) {
        Out.prln("More than one UIMA SOFA, annotation offsets won't be correct.");
    }
    StringBuilder documentContent = new StringBuilder();
    for (Annotation annotation : sofaSet) {
        String sofaString = (String) annotation.getFeatures().get("sofaString");
        // A SOFA without a sofaString contributes nothing; appending it
        // would insert the literal text "null" into the document.
        if (sofaString != null) {
            documentContent.append(sofaString);
        }
    }
    doc.setContent(new DocumentContentImpl(documentContent.toString()));
    // remove SOFA annotations
    inputAS.removeAll(sofaSet);
    // remove non document annotations
    inputAS.removeAll(inputAS.get("CAS"));
    inputAS.removeAll(inputAS.get("xmi:XMI"));
    inputAS.removeAll(inputAS.get("cas:NULL"));
    // get the views members, views will be added later as annotation sets
    List<List<String>> viewList = new ArrayList<List<String>>();
    for (Annotation view : inputAS.get(casPrefix + "View")) {
        String members = (String) view.getFeatures().get("members");
        if (members == null) {
            // Keep a placeholder so that view numbering stays aligned
            viewList.add(new ArrayList<String>());
        } else {
            viewList.add(Arrays.asList(members.split("\\s+")));
        }
    }
    inputAS.removeAll(inputAS.get(casPrefix + "View"));
    // fill a map with the id as key and the entity name as value
    // this is specific to the Temis Luxid CAS format
    Map<String, String> entityMap = new HashMap<String, String>();
    for (Annotation entity : inputAS.get("com.temis.uima.Entity")) {
        FeatureMap features = entity.getFeatures();
        String entityId = (String) features.get(idName);
        String entityName = (String) features.get("value");
        // Incomplete entities are skipped: a null key would match every
        // annotation lacking "_ref_entity", a null value would yield a
        // null annotation type.
        if (entityId != null && entityName != null) {
            entityMap.put(entityId, entityName);
        }
    }
    try {
        // for each UIMA annotation (iterate over a copy, the set is modified)
        for (Annotation annotation : new HashSet<Annotation>(inputAS)) {
            FeatureMap features = Factory.newFeatureMap();
            features.putAll(annotation.getFeatures());
            String start = (String) features.get("begin");
            String end = (String) features.get("end");
            String id = (String) features.get(idName);
            // UIMA feature
            features.remove("begin");
            // UIMA feature
            features.remove("end");
            // GATE feature
            features.remove("isEmptyAndSpan");
            // UIMA XCAS feature
            features.remove("_indexed");
            if (start == null || end == null) {
                // no offsets so add it as a GATE document feature
                features.remove(idName);
                for (Map.Entry<Object, Object> entry : features.entrySet()) {
                    doc.getFeatures().put(annotation.getType() + '_' + id + '.' + entry.getKey(), entry.getValue());
                }
            } else {
                // offsets so add it as a GATE document annotation
                Long startOffset;
                Long endOffset;
                try {
                    startOffset = Long.valueOf(start);
                    endOffset = Long.valueOf(end);
                } catch (NumberFormatException e) {
                    // Malformed input is a format error, not a programming error
                    throw new DocumentFormatException("Invalid offsets begin=\"" + start + "\" end=\"" + end + "\" for annotation of type " + annotation.getType() + ".", e);
                }
                String entityReference = (String) features.get("_ref_entity");
                String type = entityMap.containsKey(entityReference) ? entityMap.get(entityReference) : annotation.getType();
                Integer gateId = outputAS.add(startOffset, endOffset, type, features);
                int viewCount = 0;
                for (List<String> viewMembers : viewList) {
                    if (viewMembers.contains(id)) {
                        // add the annotation to the annotation set
                        doc.getAnnotations("CasView" + viewCount).add(outputAS.get(gateId));
                    }
                    viewCount++;
                }
            }
            // delete UIMA annotation
            inputAS.remove(annotation);
        }
    } catch (InvalidOffsetException e) {
        throw new DocumentFormatException("Couldn't create annotation.", e);
    }
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) AnnotationSet(gate.AnnotationSet) InvalidOffsetException(gate.util.InvalidOffsetException) Annotation(gate.Annotation) DocumentFormatException(gate.util.DocumentFormatException) FeatureMap(gate.FeatureMap) ArrayList(java.util.ArrayList) List(java.util.List) HashMap(java.util.HashMap) Map(java.util.Map) FeatureMap(gate.FeatureMap) HashSet(java.util.HashSet)

Aggregations

DocumentFormatException (gate.util.DocumentFormatException)11 IOException (java.io.IOException)7 StatusListener (gate.event.StatusListener)6 TextualDocument (gate.TextualDocument)3 InvalidOffsetException (gate.util.InvalidOffsetException)3 XmlDocumentHandler (gate.xml.XmlDocumentHandler)3 InputStream (java.io.InputStream)3 InputStreamReader (java.io.InputStreamReader)3 Reader (java.io.Reader)3 StringReader (java.io.StringReader)3 SAXException (org.xml.sax.SAXException)3 Annotation (gate.Annotation)2 AnnotationSet (gate.AnnotationSet)2 ResourceInstantiationException (gate.creole.ResourceInstantiationException)2 XMLStreamReader (javax.xml.stream.XMLStreamReader)2 gate (gate)1 DocumentFormat (gate.DocumentFormat)1 FeatureMap (gate.FeatureMap)1 EmailDocumentHandler (gate.email.EmailDocumentHandler)1 NekoHtmlDocumentHandler (gate.html.NekoHtmlDocumentHandler)1