Search in sources :

Example 1 with DocumentContentImpl

use of gate.corpora.DocumentContentImpl in project gate-core by GateNLP.

Method handleEndTag of the class HtmlDocumentHandler.

// handleStartTag
/**
 * Invoked by the HTML parser when it encounters an end tag, i.e. a tag
 * that closes a previously seen start tag.
 *
 * <p>If the element stack is not empty, its top element is popped, flagged
 * as {@code isEmptyAndSpan} when its start equals its end, and appended to
 * the collector. When the closed tag is {@code </HTML>}, the document
 * content is replaced with the accumulated text and the collector is
 * drained into annotations.
 *
 * @param t   the tag being closed
 * @param pos the position reported by the parser (not used by this method)
 */
@Override
public void handleEndTag(HTML.Tag t, int pos) {
    // Leaving a STYLE element
    if (HTML.Tag.STYLE.equals(t)) {
        isInsideStyleTag = false;
    }

    // Move the element on top of the stack (if any) into the collector
    if (!stack.isEmpty()) {
        CustomObject closed = stack.pop();
        if (closed.getStart().equals(closed.getEnd())) {
            // The element had an end tag but spans no text: mark it as an
            // emptyAndSpan one (see CustomObject's isEmptyAndSpan field).
            closed.getFM().put("isEmptyAndSpan", "true");
        }
        colector.add(closed);

        // Only elements that enclose some text get their appearance customized
        if (closed.getStart().longValue() != closed.getEnd().longValue()) {
            customizeAppearanceOfDocumentWithEndTag(t);
        }
    }

    // Everything below happens only once, when </HTML> is reached
    if (t != HTML.Tag.HTML) {
        return;
    }

    // Swap the old content for the newly built one
    doc.setContent(new DocumentContentImpl(tmpDocContent.toString()));

    // Lazily obtain the original-markups annotation set of this document
    if (basicAS == null) {
        basicAS = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
    }

    // Sort the collector by the natural ordering of its elements
    Collections.sort(colector);

    // Drain the collector, turning each element into an annotation
    while (!colector.isEmpty()) {
        CustomObject pending = colector.removeFirst();
        try {
            if (markupElementsMap != null) {
                // Only element names that have a mapping produce an annotation
                String mappedType = markupElementsMap.get(pending.getElemName());
                if (mappedType != null) {
                    basicAS.add(pending.getStart(), pending.getEnd(), mappedType, pending.getFM());
                }
            } else {
                basicAS.add(pending.getStart(), pending.getEnd(), pending.getElemName(),
                        pending.getFM());
            }
        } catch (InvalidOffsetException invalidOffsets) {
            // Report the failure and carry on with the remaining elements
            Err.prln("Error creating an annot :" + pending + " Discarded...");
        }
    }

    // Tell the listeners how many elements have been processed in total
    fireStatusChangedEvent("Total elements : " + elements);
}
Also used : DocumentContentImpl(gate.corpora.DocumentContentImpl) InvalidOffsetException(gate.util.InvalidOffsetException)

Example 2 with DocumentContentImpl

use of gate.corpora.DocumentContentImpl in project gate-core by GateNLP.

Method endDocument of the class XmlDocumentHandler.

/**
 * Invoked by the SAX parser when it encounters the end of the XML document.
 *
 * <p>The content of the GATE document is replaced with the markup-free text
 * accumulated in {@code tmpDocContent}; afterwards the collector is drained
 * and every collected element is turned into an annotation on the document.
 *
 * @throws org.xml.sax.SAXException a {@code GateSaxException} is thrown here
 *         when two collected elements carry the same id
 */
@Override
public void endDocument() throws org.xml.sax.SAXException {
    // The document now holds the text without any markup
    doc.setContent(new DocumentContentImpl(tmpDocContent.toString()));

    // Report the element count to the status listeners
    fireStatusChangedEvent("Total elements: " + elements);

    // Lazily obtain the original-markups annotation set of the document
    if (basicAS == null) {
        basicAS = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
    }

    // Sort the collector by the natural ordering of its elements
    Collections.sort(colector);

    // Ids encountered so far, used to detect duplicates
    Set<Integer> seenIds = new HashSet<Integer>();

    // Turn every collected element into an annotation on the new content
    while (!colector.isEmpty()) {
        CustomObject current = colector.getFirst();

        // Set.add reports false when the id is already present
        if (!seenIds.add(current.getId())) {
            throw new GateSaxException("Found two annotations with the same Id(" + current.getId() + ").The document is inconsistent.");
        }

        try {
            if (markupElementsMap != null) {
                // Look the annotation type up in the map; unmapped elements are skipped
                String mappedType = markupElementsMap.get(current.getElemName());
                if (mappedType != null) {
                    basicAS.add(current.getId(), current.getStart(), current.getEnd(), mappedType,
                            current.getFM());
                }
            } else {
                basicAS.add(current.getId(), current.getStart(), current.getEnd(),
                        current.getElemName(), current.getFM());
            }
        } catch (gate.util.InvalidOffsetException invalidOffsets) {
            // Report the failure and carry on with the remaining elements
            Err.prln("InvalidOffsetException for annot :" + current.getElemName() + " with Id =" + current.getId() + ". Discarded...");
        }

        // The head element has been handled; drop it from the collector
        colector.removeFirst();
    }
}
Also used : DocumentContentImpl(gate.corpora.DocumentContentImpl) GateSaxException(gate.util.GateSaxException) HashSet(java.util.HashSet)

Example 3 with DocumentContentImpl

use of gate.corpora.DocumentContentImpl in project gate-core by GateNLP.

Method endDocument of the class NekoHtmlDocumentHandler.

/**
 * Invoked when the parser reaches the end of the document. The newly built
 * content is stored in the document and the collected elements are turned
 * into annotations in the original-markups annotation set.
 *
 * @param augs augmentations supplied by the parser (not used by this method)
 * @throws XNIException declared by the overridden method; not thrown
 *         explicitly in this body
 */
@Override
public void endDocument(Augmentations augs) throws XNIException {
    if (DEBUG_GENERAL) {
        Out.println("endDocument");
    }

    // Swap the old content for the newly built one
    doc.setContent(new DocumentContentImpl(tmpDocContent.toString()));

    // Lazily obtain the original-markups annotation set of this document
    if (basicAS == null) {
        basicAS = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
    }

    // Sort the collector by the natural ordering of its elements
    Collections.sort(colector);

    // Drain the collector, turning each element into an annotation
    while (!colector.isEmpty()) {
        CustomObject pending = colector.removeFirst();
        try {
            basicAS.add(pending.getStart(), pending.getEnd(), pending.getElemName(),
                    pending.getFM());
        } catch (InvalidOffsetException invalidOffsets) {
            // Report the failure and carry on with the remaining elements
            Err.prln("Error creating an annot :" + pending + " Discarded...");
        }
    }

    // Tell the listeners how many elements have been processed in total
    fireStatusChangedEvent("Total elements : " + elements);
}
Also used : DocumentContentImpl(gate.corpora.DocumentContentImpl) InvalidOffsetException(gate.util.InvalidOffsetException)

Aggregations

DocumentContentImpl (gate.corpora.DocumentContentImpl)3 InvalidOffsetException (gate.util.InvalidOffsetException)2 GateSaxException (gate.util.GateSaxException)1 HashSet (java.util.HashSet)1