Search in sources :

Example 1 with gate

use of gate in project gate-core by GateNLP.

The method annotateParagraphs of the class TextualDocumentFormat.

// removeExtraNewLine(Document doc)
/**
 * Annotates the paragraphs found in a region of a GATE document.
 * <p>
 * The text between {@code startOffset} (inclusive) and {@code endOffset}
 * (exclusive) is scanned with a small three-state automaton. A paragraph
 * starts at the first non-whitespace character and ends when a line break
 * is immediately followed by another line break. A line break is a
 * {@code '\n'} together with any {@code '\r'} characters that directly
 * follow it. Each detected paragraph is added to the target annotation set
 * as an annotation of type {@code "paragraph"} with an empty feature map.
 * <p>
 * If the scan ends while a paragraph is still open, a final annotation is
 * created that extends to the last offset reached; when the region ends
 * right after a single line break, that line break is included in the
 * final paragraph.
 *
 * @param aDoc the GATE document on which paragraph detection is performed.
 *  If it is {@code null}, or its content is {@code null}, the method
 *  returns without doing anything.
 * @param startOffset the index in the document content at which paragraph
 *  detection starts. If it is greater than {@code endOffset} the method
 *  returns without doing anything.
 * @param endOffset the index in the document content at which paragraph
 *  detection ends (exclusive). The offsets are used directly with
 *  {@link String#charAt(int)}, so they must lie within the content.
 * @param annotSetName the name of the annotation set in which the
 *  paragraph annotations are created. If it is {@code null} the default
 *  annotation set of the document is used.
 * @throws DocumentFormatException if an annotation cannot be created
 *  because the annotation set rejects the computed offsets.
 */
public void annotateParagraphs(Document aDoc, int startOffset, int endOffset, String annotSetName) throws DocumentFormatException {
    // Nothing to do without a document or without content.
    if (aDoc == null || aDoc.getContent() == null) {
        return;
    }
    // An inverted range is treated as empty.
    if (startOffset > endOffset) {
        return;
    }
    // Decide where to put the newly detected annotations.
    AnnotationSet annotSet;
    if (annotSetName == null) {
        annotSet = aDoc.getAnnotations();
    } else {
        annotSet = aDoc.getAnnotations(annotSetName);
    }
    // Extract the document content.
    String content = aDoc.getContent().toString();
    // Offset marking the start of the current paragraph.
    int startOffsetPara = startOffset;
    // Offset marking the (candidate) end of the current paragraph.
    int endOffsetPara = endOffset;
    // State of the automaton:
    //   1 - skipping whitespace before a paragraph,
    //   2 - inside a paragraph,
    //   3 - inside a paragraph, the previous character was a line break.
    int state = 1;
    int index = startOffset;
    while (index < endOffset) {
        char ch = content.charAt(index);
        // The line-break flag is recomputed for every character. Keeping it
        // across iterations let a line break seen while skipping leading
        // whitespace (state 1) leak into the next paragraph, where an
        // ordinary character was then mistaken for a line break and the
        // paragraph was cut short.
        boolean readBR = (ch == '\n');
        if (readBR) {
            // Swallow any '\r' characters that directly follow the '\n' so
            // the whole sequence counts as a single line break.
            while ((index + 1 < endOffset) && (content.charAt(index + 1) == '\r')) {
                index++;
            }
        }
        switch (state) {
            case 1:
                // Stay here while reading whitespace; the first
                // non-whitespace character opens a paragraph.
                if (!Character.isWhitespace(ch)) {
                    state = 2;
                    startOffsetPara = index;
                }
                break;
            case 2:
                // Stay here while reading anything but a line break. A line
                // break is a possible paragraph end, so its offset is
                // recorded now and confirmed (or discarded) by state 3.
                if (readBR) {
                    endOffsetPara = index;
                    state = 3;
                }
                break;
            case 3:
                if (readBR) {
                    // Two consecutive line breaks: the paragraph is over.
                    state = 1;
                    try {
                        annotSet.add(Long.valueOf(startOffsetPara), Long.valueOf(endOffsetPara), "paragraph", Factory.newFeatureMap());
                    } catch (gate.util.InvalidOffsetException ioe) {
                        throw new DocumentFormatException("Coudn't create a paragraph" + " annotation", ioe);
                    }
                } else {
                    // A single line break does not end the paragraph.
                    state = 2;
                }
                break;
            default:
                // Unreachable: state is only ever 1, 2 or 3.
                break;
        }
        // Prepare to read the next char.
        index++;
    }
    // A paragraph that is still open when the region ends is closed at the
    // last offset reached by the scan.
    endOffsetPara = index;
    if (state == 2 || state == 3) {
        try {
            annotSet.add(Long.valueOf(startOffsetPara), Long.valueOf(endOffsetPara), "paragraph", Factory.newFeatureMap());
        } catch (gate.util.InvalidOffsetException ioe) {
            throw new DocumentFormatException("Coudn't create a paragraph" + " annotation", ioe);
        }
    }
}
Also used : DocumentFormatException(gate.util.DocumentFormatException) gate(gate)

Aggregations

gate (gate)1 DocumentFormatException (gate.util.DocumentFormatException)1