use of gate.corpora.DocumentContentImpl in project gate-core by GateNLP.
the class HtmlDocumentHandler method handleEndTag.
// handleStartTag
/**
 * Callback invoked by the HTML parser for a closing tag, i.e. a tag that
 * has a matching opening tag.
 * <p>
 * The element on top of the stack (if any) is popped and moved to the
 * collector; when its start equals its end it is flagged with the
 * {@code isEmptyAndSpan} feature. When the closing tag is {@code </HTML>}
 * the document content is replaced by the accumulated text and every
 * collected element is turned into an annotation.
 *
 * @param t the tag being closed
 * @param pos position reported by the parser; not used by this method
 */
@Override
public void handleEndTag(HTML.Tag t, int pos) {
  // Leaving a STYLE element.
  if (HTML.Tag.STYLE.equals(t)) {
    isInsideStyleTag = false;
  }

  // Close the most recently opened element, if there is one.
  if (!stack.isEmpty()) {
    CustomObject closed = stack.pop();
    // Start == end while an end tag exists: mark the element as an
    // "empty and span" one (see CustomObject's isEmptyAndSpan field).
    if (closed.getStart().equals(closed.getEnd())) {
      closed.getFM().put("isEmptyAndSpan", "true");
    }
    colector.add(closed);

    // Only elements that actually enclose text affect the appearance.
    if (closed.getStart().longValue() != closed.getEnd().longValue()) {
      customizeAppearanceOfDocumentWithEndTag(t);
    }
  }

  // Everything below happens only once, on the closing </HTML> tag.
  if (t != HTML.Tag.HTML) {
    return;
  }

  // Swap the old content for the text gathered during parsing.
  doc.setContent(new DocumentContentImpl(tmpDocContent.toString()));

  // Fall back to the original-markups annotation set of this document.
  if (basicAS == null) {
    basicAS = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
  }

  // Order the collected elements, then drain them into annotations.
  Collections.sort(colector);
  while (!colector.isEmpty()) {
    CustomObject head = colector.getFirst();
    colector.remove(head);
    try {
      if (markupElementsMap != null) {
        // A mapping is present: only mapped element names are annotated.
        String mappedType = markupElementsMap.get(head.getElemName());
        if (mappedType != null) {
          basicAS.add(head.getStart(), head.getEnd(), mappedType, head.getFM());
        }
      } else {
        // No mapping: the element name itself is the annotation type.
        basicAS.add(head.getStart(), head.getEnd(), head.getElemName(), head.getFM());
      }
    } catch (InvalidOffsetException e) {
      // The offending element is reported and skipped.
      Err.prln("Error creating an annot :" + head + " Discarded...");
    }
  }

  // Tell the listeners how many elements have been processed in total.
  fireStatusChangedEvent("Total elements : " + elements);
}
use of gate.corpora.DocumentContentImpl in project gate-core by GateNLP.
the class XmlDocumentHandler method endDocument.
/**
 * Callback invoked by the SAX parser once the end of the XML document
 * has been reached.
 * <p>
 * The content of the gate document is replaced by the markup-free text
 * built up in {@code tmpDocContent}; afterwards every element held by
 * the collector is converted into an annotation on that document.
 *
 * @throws org.xml.sax.SAXException a {@code GateSaxException} is thrown
 *         when two collected elements carry the same id
 */
@Override
public void endDocument() throws org.xml.sax.SAXException {
  // Install the markup-free content on the document.
  doc.setContent(new DocumentContentImpl(tmpDocContent.toString()));

  // Report the element count to the status listeners.
  fireStatusChangedEvent("Total elements: " + elements);

  // Fall back to the original-markups annotation set of this document.
  if (basicAS == null) {
    basicAS = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
  }

  // Order the collected elements before turning them into annotations.
  Collections.sort(colector);

  // Ids encountered so far, used to detect duplicates.
  Set<Integer> seenIds = new HashSet<Integer>();

  while (!colector.isEmpty()) {
    CustomObject current = colector.getFirst();

    // Set.add returns false when the id was already recorded.
    if (!seenIds.add(current.getId())) {
      throw new GateSaxException("Found two annotations with the same Id(" + current.getId() + ").The document is inconsistent.");
    }

    try {
      if (markupElementsMap != null) {
        // A mapping is present: only mapped element names are annotated.
        String mappedType = markupElementsMap.get(current.getElemName());
        if (mappedType != null) {
          basicAS.add(current.getId(), current.getStart(), current.getEnd(), mappedType, current.getFM());
        }
      } else {
        // No mapping: the element name itself is the annotation type.
        basicAS.add(current.getId(), current.getStart(), current.getEnd(), current.getElemName(), current.getFM());
      }
    } catch (gate.util.InvalidOffsetException e) {
      // The offending element is reported and skipped.
      Err.prln("InvalidOffsetException for annot :" + current.getElemName() + " with Id =" + current.getId() + ". Discarded...");
    }

    // Done with this element, whether or not an annotation was created.
    colector.remove(current);
  }
}
use of gate.corpora.DocumentContentImpl in project gate-core by GateNLP.
the class NekoHtmlDocumentHandler method endDocument.
/**
 * Callback invoked when the parser reaches the end of the document.
 * <p>
 * Stores the text accumulated in {@code tmpDocContent} as the new
 * document content and builds the original-markups annotations from the
 * elements held by the collector.
 *
 * @param augs augmentations supplied by the parser; not used here
 */
@Override
public void endDocument(Augmentations augs) throws XNIException {
  if (DEBUG_GENERAL) {
    Out.println("endDocument");
  }

  // Swap the old content for the text gathered during parsing.
  doc.setContent(new DocumentContentImpl(tmpDocContent.toString()));

  // Fall back to the original-markups annotation set of this document.
  if (basicAS == null) {
    basicAS = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
  }

  // Order the collected elements, then drain them into annotations.
  Collections.sort(colector);
  while (!colector.isEmpty()) {
    CustomObject head = colector.getFirst();
    colector.remove(head);
    try {
      basicAS.add(head.getStart(), head.getEnd(), head.getElemName(), head.getFM());
    } catch (InvalidOffsetException e) {
      // The offending element is reported and skipped.
      Err.prln("Error creating an annot :" + head + " Discarded...");
    }
  }

  // Tell the listeners how many elements have been processed in total.
  fireStatusChangedEvent("Total elements : " + elements);
}
Aggregations