Search in sources :

Example 6 with DocumentFormatException

use of gate.util.DocumentFormatException in project gate-core by GateNLP.

the class Annotandum method unpackMarkup.

// Converts column-formatted text in the document into plain token text plus
// annotations in the Original markups annotation set.
//
// The content is split into lines on runs of newline/carriage-return
// characters, and each line into whitespace-separated items. items[0] is the
// token; every further column is a tag that is either "O", a prefixed tag
// ("U-", "L-", "B-", "I-" followed by a type), or a bare type name.
//
// @param doc the GATE document whose content is rewritten in place
// @throws DocumentFormatException if doc is null, if both its source URL and
//         its content are null, or if an InvalidOffsetException is raised
//         while editing the content or adding annotations
//
// NOTE(review): this excerpt appears to have lost closing braces and possibly
// whole statements during extraction; confirm details against the upstream
// gate-core source before relying on it.
public void unpackMarkup(gate.Document doc) throws DocumentFormatException {
    if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
        throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    // One array entry per non-empty run of text between line breaks.
    String[] lines = doc.getContent().toString().split("[\\n\\r]+");
    // Replacement document text, built up token by token.
    StringBuilder newContent = new StringBuilder();
    // Items of data to be turned into Original markups annotations
    List<Annotandum> annotanda = new ArrayList<Annotandum>();
    // Currently open tags: created by "B-FOO", extended by "I-FOO", closed
    // by "O" or end of sentence.
    Map<String, Annotandum> inProgress = new HashMap<String, Annotandum>();
    /* Note: I-Foo handling currently has a weak spot.
     * this    B-Foo
     * is      B-Bar
     * strange I-Foo
     * will result in a Foo annotation spanning "this is strange", because
     * the I-Foo extends the existing B-Foo.  If the sentence is cut off 
     * before hitting another I-Foo, however, the Foo annotation will not
     * have been extended.  But this situation will not occur in carefully
     * edited input.  
    // NOTE(review): the block comment opened above has no visible terminator
    // in this excerpt.
    // oldEnd holds the end offset of the previous line's token; it is the
    // offset passed when closing tags that were opened on earlier lines.
    long oldEnd = 0L;
    long start = 0L;
    long end = 0L;
    for (String line : lines) {
        oldEnd = end;
        start = newContent.length();
        String[] items = line.split("\\s+");
        // no items on this line: close any annotations in progress
        if (items.length == 0) {
            end = newContent.length();
            finishAllTags(inProgress, annotanda, oldEnd);
        } else {
            String token = items[0];
            // NOTE(review): no append of the token to newContent is visible
            // between computing start and end, so as shown start == end;
            // presumably the token is appended here upstream - confirm.
            // We've agreed to put the space after every token.
            end = newContent.length();
            newContent.append(' ');
            // Create Token and following SpaceToken annotation.
            annotanda.add(Annotandum.makeToken(start, end, token));
            // Columns after the token each carry one tag.
            for (int column = 1; column < items.length; column++) {
                // O means close all annotations in progress
                if (items[column].equals("O")) {
                    finishAllTags(inProgress, annotanda, oldEnd);
                } else // "U-FOO": annotation on this token only, after closing any "FOO" already in progress
                if ((items[column].length() > 2) && items[column].startsWith("U-")) {
                    String type = items[column].substring(2);
                    finishTag(type, inProgress, annotanda, oldEnd);
                    annotanda.add(new Annotandum(type, start, end, column, true));
                } else // "L-FOO": extend and then close any "FOO" already in progress
                if ((items[column].length() > 2) && items[column].startsWith("L-")) {
                    String type = items[column].substring(2);
                    if (inProgress.containsKey(type)) {
                        // good L-FOO, so update the end offset
                        inProgress.get(type).endOffset = end;
                    } else {
                        // bad data, containing L-FOO without a B-FOO, so treat as if B-FOO
                        inProgress.put(type, new Annotandum(type, start, end, column, true));
                    // Closed at this token's end offset rather than oldEnd.
                    finishTag(type, inProgress, annotanda, end);
                } else // "B-FOO": open a new "FOO" after closing any "FOO" already in progress
                if ((items[column].length() > 2) && items[column].startsWith("B-")) {
                    String type = items[column].substring(2);
                    finishTag(type, inProgress, annotanda, oldEnd);
                    inProgress.put(type, new Annotandum(type, start, end, column, true));
                } else // "I-FOO": extend current "FOO" annotation
                if ((items[column].length() > 2) && items[column].startsWith("I-")) {
                    String type = items[column].substring(2);
                    if (inProgress.containsKey(type)) {
                        // good I-FOO, so update the end offset
                        inProgress.get(type).endOffset = end;
                    } else {
                        // bad data, containing I-FOO without a B-FOO, so treat as if B-FOO
                        inProgress.put(type, new Annotandum(type, start, end, column, true));
                } else // "FOO": treat as single-token annotation (such as POS tag)
                    // NOTE(review): tag is not used after construction in this
                    // excerpt; presumably it is added to annotanda - confirm.
                    Annotandum tag = new Annotandum(items[column], start, end, column, false);
    // end of input: close any remaining annotations
    finishAllTags(inProgress, annotanda, end);
    // set new content & create Original markups annotations
    try {
        DocumentContent newContentImpl = new DocumentContentImpl(newContent.toString());
        // Replace the whole existing content with the rebuilt text.
        doc.edit(0L, doc.getContent().size(), newContentImpl);
        long newSize = doc.getContent().size();
        AnnotationSet originalMarkups = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
        for (Annotandum ann : annotanda) {
            if (DEBUG) {
                // The end offset is clamped to the new size for the debug print only.
                String string = Utils.stringFor(doc, ann.startOffset, (ann.endOffset <= newSize) ? ann.endOffset : newSize);
                System.out.format("%d  %d  %s  %s\n", ann.startOffset, ann.endOffset, ann.type, string);
            originalMarkups.add(ann.startOffset, ann.endOffset, ann.type, ann.features);
    } catch (InvalidOffsetException e) {
        throw new DocumentFormatException(e);
Also used : InvalidOffsetException(gate.util.InvalidOffsetException) DocumentFormatException(gate.util.DocumentFormatException)

Example 7 with DocumentFormatException

use of gate.util.DocumentFormatException in project gate-core by GateNLP.

the class DocumentImpl method init.

 * Initialise this resource, and return it.
// Initialises this document resource: builds the content from either the
// string content or the source URL, optionally stores the original content
// as a document feature, and, when markup-aware, selects a DocumentFormat
// and unpacks the markup.
//
// @return this resource
// @throws ResourceInstantiationException if both sourceUrl and stringContent
//         are null, if reading the source URL raises an IOException, if the
//         given MIME type has no registered DocumentFormat, or if markup
//         unpacking raises a DocumentFormatException
//
// NOTE(review): this excerpt appears to have lost closing braces and some
// statements during extraction; confirm against the upstream source.
public Resource init() throws ResourceInstantiationException {
    // set up the source URL and create the content
    if (sourceUrl == null) {
        if (stringContent == null) {
            throw new ResourceInstantiationException("The sourceURL and document's content were null.");
        content = new DocumentContentImpl(stringContent);
        // Marker value recorded in place of a URL for string-built documents.
        getFeatures().put("gate.SourceURL", "created from String");
    } else {
        try {
            content = new DocumentContentImpl(sourceUrl, getEncoding(), sourceUrlStartOffset, sourceUrlEndOffset);
            getFeatures().put("gate.SourceURL", sourceUrl.toExternalForm());
        } catch (IOException e) {
            // NOTE(review): the IOException is concatenated into the message
            // rather than passed as the cause.
            throw new ResourceInstantiationException("DocumentImpl.init: " + e);
    // Keep a copy of the original content as a feature when requested.
    if (preserveOriginalContent.booleanValue() && content != null) {
        String originalContent = ((DocumentContentImpl) content).getOriginalContent();
        getFeatures().put(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME, originalContent);
    // set up a DocumentFormat if markup unpacking required
    if (getMarkupAware().booleanValue()) {
        DocumentFormat docFormat = null;
        // if a specific MIME type has been given, use it
        if (this.mimeType != null && this.mimeType.length() > 0) {
            MimeType theType = DocumentFormat.getMimeTypeForString(mimeType);
            if (theType == null) {
                // NOTE(review): the message opens a quote before the MIME type
                // but never closes it.
                throw new ResourceInstantiationException("MIME type \"" + this.mimeType + " has no registered DocumentFormat");
            docFormat = DocumentFormat.getDocumentFormat(this, theType);
        } else {
            // No MIME type given: choose the format from the source URL.
            docFormat = DocumentFormat.getDocumentFormat(this, sourceUrl);
        try {
            if (docFormat != null) {
                StatusListener sListener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
                // NOTE(review): the statement guarded by this null check is not
                // visible in this excerpt - confirm what it does upstream.
                if (sListener != null)
                // set the flag if true and if the document format support collecting
                if (docFormat.getShouldCollectRepositioning().booleanValue()) {
                    // unpack with collecting of repositioning information
                    RepositioningInfo info = new RepositioningInfo();
                    String origContent = (String) getFeatures().get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
                    RepositioningInfo ampCodingInfo = new RepositioningInfo();
                    if (origContent != null) {
                        // CR correction is requested only for XML document formats.
                        boolean shouldCorrectCR = docFormat instanceof XmlDocumentFormat;
                        collectInformationForAmpCodding(origContent, ampCodingInfo, shouldCorrectCR);
                        // Extra whitespace information is collected for text/html only.
                        if (docFormat.getMimeType().equals(new MimeType("text", "html"))) {
                            collectInformationForWS(origContent, ampCodingInfo);
                    // if
                    // if
                    docFormat.unpackMarkup(this, info, ampCodingInfo);
                    if (origContent != null && docFormat instanceof XmlDocumentFormat) {
                        // CRLF correction of RepositioningInfo
                        correctRepositioningForCRLFInXML(origContent, info);
                    // if
                    getFeatures().put(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME, info);
                } else {
                    // normal old fashioned unpack
                    // NOTE(review): the unpack call for this branch is not
                    // visible in this excerpt.
        // if format != null
        } catch (DocumentFormatException e) {
            // The source URL is included in the message when there is one.
            throw new ResourceInstantiationException("Couldn't unpack markup in document " + (sourceUrl != null ? sourceUrl.toExternalForm() : "") + "!", e);
    // }
    return this;
Also used : DocumentFormatException(gate.util.DocumentFormatException) DocumentFormat(gate.DocumentFormat) IOException(java.io.IOException) StatusListener(gate.event.StatusListener) ResourceInstantiationException(gate.creole.ResourceInstantiationException)

Example 8 with DocumentFormatException

use of gate.util.DocumentFormatException in project gate-core by GateNLP.

the class EmailDocumentFormat method unpackMarkup.

 * Unpack the markup in the document. This converts markup from the
 * native format (e.g. EMAIL) into annotations in GATE format.
 * Uses the markupElementsMap to determine which elements to convert, and
 * what annotation type names to use.
 * It always tries to parse the doc's content. It doesn't matter if the
 * sourceUrl is null or not.
 * @param doc The gate document you want to parse.
// Unpacks e-mail markup in the given document and then annotates paragraphs
// inside every "body" annotation found in the Original markups set.
//
// @param doc the GATE document to parse
// @throws DocumentFormatException if doc is null, if both its source URL and
//         its content are null, or if an IOException or
//         InvalidOffsetException is raised while processing
//
// NOTE(review): this excerpt is visibly truncated (handler construction,
// listener registration, the annotation-creating call and the iterator read
// are incomplete or missing); confirm against the upstream source.
public void unpackMarkup(gate.Document doc) throws DocumentFormatException {
    if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
        throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    // End if
    // create an EmailDocumentHandler
    EmailDocumentHandler emailDocHandler = null;
    // NOTE(review): the constructor name and first argument are missing from
    // the next line in this excerpt.
    emailDocHandler = new, this.markupElementsMap, this.element2StringMap);
    StatusListener statusListener = new StatusListener() {

        public void statusChanged(String text) {
            // this is implemented in and inherited here
            // NOTE(review): the comment above and the method body appear
            // truncated in this excerpt.
    // Register a status listener with it
    try {
        // Call the method that creates annotations on the gate document
        // Process the body annotations and search for paragraphs
        AnnotationSet bodyAnnotations = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get("body");
        if (bodyAnnotations != null && !bodyAnnotations.isEmpty()) {
            Iterator<Annotation> iter = bodyAnnotations.iterator();
            while (iter.hasNext()) {
                // NOTE(review): the right-hand side is missing here; presumably
                // the next element of iter - confirm.
                Annotation a =;
                // Paragraph detection is limited to the span of this body annotation.
                annotateParagraphs(doc, a.getStartNode().getOffset().intValue(), a.getEndNode().getOffset().intValue(), GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
        // End while
    // End if
    } catch (IOException e) {
        throw new DocumentFormatException("Couldn't create a buffered reader ", e);
    } catch (InvalidOffsetException e) {
        throw new DocumentFormatException(e);
    } finally {
        // NOTE(review): the body of this finally block is not visible here.
// End try
Also used : DocumentFormatException(gate.util.DocumentFormatException) EmailDocumentHandler( EmailDocumentHandler( AnnotationSet(gate.AnnotationSet) InvalidOffsetException(gate.util.InvalidOffsetException) StatusListener(gate.event.StatusListener) IOException( Annotation(gate.Annotation)

Example 9 with DocumentFormatException

use of gate.util.DocumentFormatException in project gate-core by GateNLP.

the class TextualDocumentFormat method annotateParagraphs.

// removeExtraNewLine(Document doc)
 * This method annotates paragraphs in a GATE document. The investigated text
 * spans between start and end offsets and the paragraph annotations are
 * created in the annotSetName. If annotSetName is null then they are created
 * in the default annotation set.
 * @param aDoc is the gate document on which the paragraph detection would
 *  be performed. If it is null or its content is null then the method would
 *  simply return doing nothing.
 * @param startOffset is the index from the document content from which the
 * paragraph detection will start
 * @param endOffset is the offset where the detection will end.
 * @param annotSetName is the name of the set in which paragraph annotation
 * would be created. The annotation type created will be "paragraph"
// Scans the document text between startOffset and endOffset with a small
// three-state machine and adds "paragraph" annotations to the chosen set.
//
// States, as far as the visible code shows:
//   1 - skipping whitespace; the first non-whitespace char starts a paragraph
//   2 - inside a paragraph; a line break records a candidate end offset
//   3 - one line break seen; a second one closes the paragraph, anything
//       else returns to state 2
//
// @throws DocumentFormatException if adding an annotation raises an
//         InvalidOffsetException
//
// NOTE(review): this excerpt appears to have lost closing braces and several
// statements (early returns, an else, switch breaks, the index increment);
// confirm against the upstream source.
public void annotateParagraphs(Document aDoc, int startOffset, int endOffset, String annotSetName) throws DocumentFormatException {
    // Simply return if the document is null or its content
    // NOTE(review): the guarded return statement is not visible here.
    if (aDoc == null || aDoc.getContent() == null)
    // Simply return if the start is > than the end
    // NOTE(review): the guarded return statement is not visible here.
    if (startOffset > endOffset)
    // Decide where to put the newly detected annotations
    AnnotationSet annotSet = null;
    if (annotSetName == null)
        annotSet = aDoc.getAnnotations();
        // NOTE(review): an else appears to be missing before the next line;
        // as shown, the named set would always overwrite the default one.
        annotSet = aDoc.getAnnotations(annotSetName);
    // Extract the document content
    String content = aDoc.getContent().toString();
    // This is the offset marking the start of a para
    int startOffsetPara = startOffset;
    // This marks the end of a para
    int endOffsetPara = endOffset;
    // The initial state of the FSA
    int state = 1;
    // This field marks that a BR entity was read
    // A BR entity can be NL or NL CR, depending on the operating system (UNIX
    // or DOS)
    boolean readBR = false;
    int index = startOffset;
    while (index < endOffset) {
        // Read the current char
        char ch = content.charAt(index);
        // Test if a BR entity was read
        if (ch == '\n') {
            readBR = true;
            // BR entity
            // Swallow any carriage returns that directly follow the newline.
            while ((index + 1 < endOffset) && (content.charAt(index + 1) == '\r')) index++;
        // End if
        switch(state) {
            // Stay in state 1 while it reads whitespaces
            case 1:
                    // the beginning of a paragraph
                    if (!Character.isWhitespace(ch)) {
                        state = 2;
                        startOffsetPara = index;
                // End if
            // It can be also a final state.
            case 2:
                    // Stay in state 2 while reading chars != BR entities
                    if (readBR) {
                        // If you find a BR char go to state 3. The possible end of the para
                        // can be index. This will be confirmed by state 3. So, this is why
                        // the end of a para is recorded here.
                        readBR = false;
                        endOffsetPara = index;
                        state = 3;
                // End if
            // For state 2 it needs to read something different than a BR
            case 3:
                    if (readBR) {
                        // A BR was read. Go to state 1
                        readBR = false;
                        state = 1;
                        // Create an annotation type paragraph
                        try {
                            annotSet.add(Long.valueOf(startOffsetPara), Long.valueOf(endOffsetPara), "paragraph", Factory.newFeatureMap());
                        } catch (gate.util.InvalidOffsetException ioe) {
                            throw new DocumentFormatException("Coudn't create a paragraph" + " annotation", ioe);
                    // End try
                    } else {
                        // Go to state 2 and keep reading chars
                        state = 2;
                // End if
        // End switch
        // Prepare to read the next char.
        // NOTE(review): the increment of index is not visible in this excerpt.
    // End while
    endOffsetPara = index;
    // Investigate where the finite automaton has stopped
    if (state == 2 || state == 3) {
        // Create an annotation type paragraph
        try {
            annotSet.add(Long.valueOf(startOffsetPara), // Create the final annotation using the endOffset
            Long.valueOf(endOffsetPara), "paragraph", Factory.newFeatureMap());
        } catch (gate.util.InvalidOffsetException ioe) {
            throw new DocumentFormatException("Coudn't create a paragraph" + " annotation", ioe);
    // End try
// End if
Also used : DocumentFormatException(gate.util.DocumentFormatException) gate(gate)

Example 10 with DocumentFormatException

use of gate.util.DocumentFormatException in project gate-core by GateNLP.

the class UimaDocumentFormat method unpackCasMarkup.

 * Convert UIMA CAS markups to GATE markups.
 * @param doc XML document already parsed
 * @throws DocumentFormatException error when parsing the file
// Rewrites annotations in the "Original markups" set that came from a UIMA
// CAS document: detects the XCAS or XMI flavour, copies list/array text into
// an "elements" feature, rebuilds the document content from the Sofa
// annotations, and turns annotations with begin/end features into GATE
// annotations (others become document features).
//
// @param doc the document to convert
// @throws DocumentFormatException if neither a "CAS" nor an "xmi:XMI"
//         annotation is present, or if an InvalidOffsetException is raised
//
// NOTE(review): this excerpt appears to have lost closing braces and several
// statements during extraction; confirm against the upstream source.
private void unpackCasMarkup(Document doc) throws DocumentFormatException {
    // Input and output are both the set named "Original markups".
    AnnotationSet inputAS = doc.getAnnotations("Original markups");
    AnnotationSet outputAS = doc.getAnnotations("Original markups");
    // set format specific names
    String casPrefix;
    String idName;
    if (!inputAS.get("CAS").isEmpty()) {
        casPrefix = "uima.cas.";
        idName = "_id";
    } else if (!inputAS.get("xmi:XMI").isEmpty()) {
        casPrefix = "cas:";
        idName = "xmi:id";
    } else {
        throw new DocumentFormatException("The document \"" + doc.getName() + "\" is neither of XCAS nor XMICAS format.");
    // get array/list contained elements annotations
    for (Annotation annotation : inputAS) {
        // NOTE(review): casPrefix is used unescaped inside the regular
        // expression, so the dots in "uima.cas." match any character.
        if (annotation.getType().matches(casPrefix + "[a-zA-Z]+(List|Array)")) {
            try {
                String elements = doc.getContent().getContent(annotation.getStartNode().getOffset(), annotation.getEndNode().getOffset()).toString();
                // add contained values as a feature to the array annotation
                if (!elements.trim().equals("")) {
                    annotation.getFeatures().put("elements", elements);
            } catch (InvalidOffsetException e) {
                throw new DocumentFormatException(e);
    // get document content from SOFA annotations
    Set<Annotation> sofaSet = inputAS.get(casPrefix + "Sofa");
    if (sofaSet.size() > 1) {
        Out.prln("More than one UIMA SOFA, annotation offsets won't be correct.");
    // Concatenate the "sofaString" feature of every Sofa annotation.
    StringBuilder documentContent = new StringBuilder();
    for (Annotation annotation : sofaSet) {
        documentContent.append((String) annotation.getFeatures().get("sofaString"));
    doc.setContent(new DocumentContentImpl(documentContent.toString()));
    // remove SOFA annotations
    // remove non document annotations
    // NOTE(review): the removal statements announced by the two comments
    // above are not visible in this excerpt.
    // get the views members, views will be added later as annotation sets
    List<List<String>> viewList = new ArrayList<List<String>>();
    for (Annotation view : inputAS.get(casPrefix + "View")) {
        // The "members" feature is a whitespace-separated list of ids.
        viewList.add(Arrays.asList(((String) view.getFeatures().get("members")).split("\\s+")));
    inputAS.removeAll(inputAS.get(casPrefix + "View"));
    // fill a map with the id as key and the entity name as value
    // this is specific to the Temis Luxid CAS format
    Map<String, String> entityMap = new HashMap<String, String>();
    for (Annotation entity : inputAS.get("com.temis.uima.Entity")) {
        FeatureMap features = entity.getFeatures();
        entityMap.put((String) features.get(idName), (String) features.get("value"));
    try {
        // for each UIMA annotation
        // Iterates over a copy so the underlying set can be modified meanwhile.
        for (Annotation annotation : new HashSet<Annotation>(inputAS)) {
            FeatureMap features = Factory.newFeatureMap();
            // NOTE(review): nothing visible copies the annotation's features
            // into this new map before the reads below - confirm upstream.
            String start = (String) features.get("begin");
            String end = (String) features.get("end");
            String id = (String) features.get(idName);
            // UIMA feature
            // UIMA feature
            // GATE feature
            // UIMA XCAS feature
            // NOTE(review): the statements these four comments described are
            // not visible in this excerpt.
            if (start == null || end == null) {
                // no offsets so add it as a GATE document feature
                for (Map.Entry<Object, Object> entry : features.entrySet()) {
                    doc.getFeatures().put(annotation.getType() + '_' + id + '.' + entry.getKey(), entry.getValue());
            } else {
                // offsets so add it as a GATE document annotation
                String entityReference = (String) features.get("_ref_entity");
                // Use the referenced entity name as the type when one is known.
                String type = entityMap.containsKey(entityReference) ? entityMap.get(entityReference) : annotation.getType();
                Integer gateId = outputAS.add(Long.valueOf(start), Long.valueOf(end), type, features);
                int viewCount = 0;
                for (List<String> viewMembers : viewList) {
                    if (viewMembers.contains(id)) {
                        // add the annotation to the annotation set
                        doc.getAnnotations("CasView" + viewCount).add(outputAS.get(gateId));
                        // NOTE(review): no increment of viewCount is visible in
                        // this excerpt, so as shown the set is always "CasView0".
            // delete UIMA annotation
            // NOTE(review): the deletion statement is not visible here.
    } catch (InvalidOffsetException e) {
        throw new DocumentFormatException("Couldn't create annotation.", e);
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) AnnotationSet(gate.AnnotationSet) InvalidOffsetException(gate.util.InvalidOffsetException) Annotation(gate.Annotation) DocumentFormatException(gate.util.DocumentFormatException) FeatureMap(gate.FeatureMap) ArrayList(java.util.ArrayList) List(java.util.List) HashMap(java.util.HashMap) Map(java.util.Map) FeatureMap(gate.FeatureMap) HashSet(java.util.HashSet)


DocumentFormatException (gate.util.DocumentFormatException)11 IOException ( StatusListener (gate.event.StatusListener)6 TextualDocument (gate.TextualDocument)3 InvalidOffsetException (gate.util.InvalidOffsetException)3 XmlDocumentHandler (gate.xml.XmlDocumentHandler)3 InputStream ( InputStreamReader ( Reader ( StringReader ( SAXException (org.xml.sax.SAXException)3 Annotation (gate.Annotation)2 AnnotationSet (gate.AnnotationSet)2 ResourceInstantiationException (gate.creole.ResourceInstantiationException)2 XMLStreamReader ( gate (gate)1 DocumentFormat (gate.DocumentFormat)1 FeatureMap (gate.FeatureMap)1 EmailDocumentHandler ( NekoHtmlDocumentHandler (gate.html.NekoHtmlDocumentHandler)1