use of gate in project gate-core by GateNLP.
the class TextualDocumentFormat method annotateParagraphs.
// removeExtraNewLine(Document doc)
/**
 * Annotates paragraphs in a GATE document. The investigated text spans
 * between the start and end offsets, and the paragraph annotations are
 * created in the annotation set named {@code annotSetName}. If
 * {@code annotSetName} is null they are created in the default annotation
 * set.
 * <p>
 * A paragraph starts at the first non-whitespace character and ends when two
 * line breaks are read one right after the other (a line break is a
 * {@code '\n'} optionally followed by {@code '\r'} characters). A paragraph
 * still open when the scan finishes is closed at the end of the scanned
 * range.
 *
 * @param aDoc is the GATE document on which the paragraph detection is
 * performed. If it is null, or its content is null, the method simply
 * returns doing nothing.
 * @param startOffset is the index in the document content from which the
 * paragraph detection will start. Negative values are treated as 0.
 * @param endOffset is the offset where the detection will end (exclusive).
 * Values past the end of the content are treated as the content length. If
 * it is smaller than {@code startOffset} the method does nothing.
 * @param annotSetName is the name of the set in which paragraph annotations
 * are created. The annotation type created is "paragraph".
 * @throws DocumentFormatException if a paragraph annotation could not be
 * added because the annotation set rejected its offsets.
 */
public void annotateParagraphs(Document aDoc, int startOffset, int endOffset, String annotSetName) throws DocumentFormatException {
  // Nothing to do without a document or without content
  if (aDoc == null || aDoc.getContent() == null) {
    return;
  }
  // Nothing to do for an inverted range
  if (startOffset > endOffset) {
    return;
  }
  // Decide where to put the newly detected annotations
  AnnotationSet annotSet = (annotSetName == null)
      ? aDoc.getAnnotations()
      : aDoc.getAnnotations(annotSetName);
  // Extract the document content
  String content = aDoc.getContent().toString();
  // Keep the scan inside the content so that charAt() can never be asked for
  // an index outside the string
  int index = Math.max(startOffset, 0);
  int limit = Math.min(endOffset, content.length());
  // Offset marking the start of the current paragraph
  int startOffsetPara = index;
  // Candidate end of the current paragraph (recorded at its first line break)
  int endOffsetPara = limit;
  // State of the automaton:
  //   1 - skipping whitespace before a paragraph
  //   2 - inside a paragraph
  //   3 - one line break was read after paragraph text
  int state = 1;
  while (index < limit) {
    // Read the current char
    char ch = content.charAt(index);
    // The flag is recomputed for every character. Keeping it across
    // iterations let a newline skipped in state 1 be mistaken for a line
    // break inside the following paragraph, cutting that paragraph short.
    boolean readBR = (ch == '\n');
    if (readBR) {
      // Swallow any '\r' following the '\n' so they count as the same break
      while ((index + 1 < limit) && (content.charAt(index + 1) == '\r')) {
        index++;
      }
    }
    switch (state) {
      case 1:
        // The first non-whitespace char is the beginning of a paragraph
        if (!Character.isWhitespace(ch)) {
          state = 2;
          startOffsetPara = index;
        }
        break;
      case 2:
        // Stay here while reading chars that are not line breaks. On a line
        // break remember the possible end of the paragraph; state 3
        // confirms or discards it.
        if (readBR) {
          endOffsetPara = index;
          state = 3;
        }
        break;
      case 3:
        if (readBR) {
          // A second consecutive line break: the paragraph is complete
          state = 1;
          addParagraphAnnotation(annotSet, startOffsetPara, endOffsetPara);
        } else {
          // Just a single line break inside the paragraph; keep reading
          state = 2;
        }
        break;
    }
    // Prepare to read the next char
    index++;
  }
  // A paragraph that was still open when the scan stopped ends where the
  // scan stopped
  if (state == 2 || state == 3) {
    addParagraphAnnotation(annotSet, startOffsetPara, index);
  }
}

/**
 * Adds one "paragraph" annotation with an empty feature map to the given set.
 *
 * @param annotSet the set receiving the annotation.
 * @param start the start offset of the paragraph.
 * @param end the end offset of the paragraph.
 * @throws DocumentFormatException wrapping the InvalidOffsetException thrown
 * by the annotation set when it rejects the offsets.
 */
private static void addParagraphAnnotation(AnnotationSet annotSet, int start, int end) throws DocumentFormatException {
  try {
    annotSet.add(Long.valueOf(start), Long.valueOf(end), "paragraph", Factory.newFeatureMap());
  } catch (gate.util.InvalidOffsetException ioe) {
    throw new DocumentFormatException("Coudn't create a paragraph" + " annotation", ioe);
  }
}
Aggregations