use of gate.util.DocumentFormatException in project gate-core by GateNLP.
the class Annotandum method unpackMarkup.
/**
 * Rebuilds the document content from column-oriented input and records the
 * markup found in the extra columns as annotations in the Original markups
 * annotation set.
 * <p>
 * The content is cut into lines on runs of newline / carriage-return
 * characters and every line is cut into whitespace-separated columns. The
 * first column is the token; it is written to the new content followed by a
 * single space and yields a Token and a SpaceToken annotandum. A line with no
 * columns writes a newline and closes every open tag. Each further column is
 * interpreted as {@code O}, {@code U-FOO}, {@code L-FOO}, {@code B-FOO},
 * {@code I-FOO} or, failing all of those, a plain single-token tag.
 *
 * @param doc the GATE document to parse
 * @throws DocumentFormatException if the document is null, has neither a
 *         source URL nor content, or an annotation offset is rejected
 */
@Override
public void unpackMarkup(gate.Document doc) throws DocumentFormatException {
  if (doc == null || (doc.getSourceUrl() == null && doc.getContent() == null)) {
    throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
  }
  setNewLineProperty(doc);

  StringBuilder rebuilt = new StringBuilder();
  // Items of data to be turned into Original markups annotations
  List<Annotandum> pending = new ArrayList<Annotandum>();
  // Currently open tags: created by "B-FOO", extended by "I-FOO", closed
  // by "O" or end of sentence.
  Map<String, Annotandum> open = new HashMap<String, Annotandum>();

  /* Note: I-Foo handling currently has a weak spot.
   *
   *   this     B-Foo
   *   is       B-Bar
   *   strange  I-Foo
   *
   * will result in a Foo annotation spanning "this is strange", because
   * the I-Foo extends the existing B-Foo. If the sentence is cut off
   * before hitting another I-Foo, however, the Foo annotation will not
   * have been extended. But this situation will not occur in carefully
   * edited input.
   */
  long tokenEnd = 0L;
  for (String row : doc.getContent().toString().split("[\\n\\r]+")) {
    // end offset of the previous line's token: where still-open tags stop
    final long previousEnd = tokenEnd;
    final long tokenStart = rebuilt.length();
    String[] columns = row.split("\\s+");

    if (columns.length == 0) {
      // no columns at all: emit a line break and close everything in progress
      rebuilt.append("\n");
      tokenEnd = rebuilt.length();
      finishAllTags(open, pending, previousEnd);
      continue;
    }

    // We've agreed to put the space after every token.
    String token = columns[0];
    rebuilt.append(token);
    tokenEnd = rebuilt.length();
    rebuilt.append(' ');
    // Create Token and following SpaceToken annotation.
    pending.add(Annotandum.makeToken(tokenStart, tokenEnd, token));
    pending.add(Annotandum.makeSpaceToken(tokenEnd));

    for (int col = 1; col < columns.length; col++) {
      final String label = columns[col];
      if (label.equals("O")) {
        // O means close all annotations in progress
        finishAllTags(open, pending, previousEnd);
        continue;
      }
      // a prefixed tag needs at least one character after the two-char prefix
      final boolean hasPrefix = label.length() > 2;
      if (hasPrefix && label.startsWith("U-")) {
        // "U-FOO": single-token annotation, after closing any "FOO" in progress
        String type = label.substring(2);
        finishTag(type, open, pending, previousEnd);
        pending.add(new Annotandum(type, tokenStart, tokenEnd, col, true));
      } else if (hasPrefix && label.startsWith("L-")) {
        // "L-FOO": last token of "FOO"
        String type = label.substring(2);
        if (open.containsKey(type)) {
          // good L-FOO, so update the end offset
          open.get(type).endOffset = tokenEnd;
        } else {
          // bad data, containing L-FOO without a B-FOO, so treat as if B-FOO
          open.put(type, new Annotandum(type, tokenStart, tokenEnd, col, true));
        }
        finishTag(type, open, pending, tokenEnd);
      } else if (hasPrefix && label.startsWith("B-")) {
        // "B-FOO": begin a new "FOO" after closing any "FOO" already in progress
        String type = label.substring(2);
        finishTag(type, open, pending, previousEnd);
        open.put(type, new Annotandum(type, tokenStart, tokenEnd, col, true));
      } else if (hasPrefix && label.startsWith("I-")) {
        // "I-FOO": extend current "FOO" annotation
        String type = label.substring(2);
        if (open.containsKey(type)) {
          // good I-FOO, so update the end offset
          open.get(type).endOffset = tokenEnd;
        } else {
          // bad data, containing I-FOO without a B-FOO, so treat as if B-FOO
          open.put(type, new Annotandum(type, tokenStart, tokenEnd, col, true));
        }
      } else {
        // "FOO": treat as single-token annotation (such as POS tag)
        pending.add(new Annotandum(label, tokenStart, tokenEnd, col, false));
      }
    }
  }
  // end of input: close any remaining annotations
  finishAllTags(open, pending, tokenEnd);

  // set new content & create Original markups annotations
  try {
    DocumentContent replacement = new DocumentContentImpl(rebuilt.toString());
    doc.edit(0L, doc.getContent().size(), replacement);
    long newSize = doc.getContent().size();
    AnnotationSet originalMarkups = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
    for (Annotandum item : pending) {
      if (DEBUG) {
        // clip the end offset so the debug string never reads past the content
        String covered = Utils.stringFor(doc, item.startOffset, (item.endOffset > newSize) ? newSize : item.endOffset);
        System.out.format("%d %d %s %s\n", item.startOffset, item.endOffset, item.type, covered);
      }
      originalMarkups.add(item.startOffset, item.endOffset, item.type, item.features);
    }
  } catch (InvalidOffsetException e) {
    throw new DocumentFormatException(e);
  }
}
use of gate.util.DocumentFormatException in project gate-core by GateNLP.
the class DocumentImpl method init.
/**
 * Initialise this resource, and return it.
 * <p>
 * The content is created from {@code stringContent} when no source URL is
 * set, otherwise it is read from the source URL. When markup awareness is
 * switched on, a {@link DocumentFormat} is looked up (by the explicit MIME
 * type if one was given, else from the source URL) and asked to unpack the
 * markup, optionally collecting repositioning information.
 *
 * @return this document
 * @throws ResourceInstantiationException if both the source URL and the
 *         string content are null, the URL cannot be read, the given MIME
 *         type has no registered format, or the markup cannot be unpacked
 */
@Override
public Resource init() throws ResourceInstantiationException {
  // set up the source URL and create the content
  if (sourceUrl == null) {
    if (stringContent == null) {
      throw new ResourceInstantiationException("The sourceURL and document's content were null.");
    }
    content = new DocumentContentImpl(stringContent);
    getFeatures().put("gate.SourceURL", "created from String");
  } else {
    try {
      content = new DocumentContentImpl(sourceUrl, getEncoding(), sourceUrlStartOffset, sourceUrlEndOffset);
      getFeatures().put("gate.SourceURL", sourceUrl.toExternalForm());
    } catch (IOException e) {
      // chain the IOException so its stack trace is not lost
      throw new ResourceInstantiationException("DocumentImpl.init: " + e, e);
    }
  }
  if (preserveOriginalContent.booleanValue() && content != null) {
    String originalContent = ((DocumentContentImpl) content).getOriginalContent();
    getFeatures().put(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME, originalContent);
  }
  // set up a DocumentFormat if markup unpacking required
  if (getMarkupAware().booleanValue()) {
    DocumentFormat docFormat = null;
    // if a specific MIME type has been given, use it
    if (this.mimeType != null && this.mimeType.length() > 0) {
      MimeType theType = DocumentFormat.getMimeTypeForString(mimeType);
      if (theType == null) {
        throw new ResourceInstantiationException("MIME type \"" + this.mimeType + "\" has no registered DocumentFormat");
      }
      docFormat = DocumentFormat.getDocumentFormat(this, theType);
    } else {
      docFormat = DocumentFormat.getDocumentFormat(this, sourceUrl);
    }
    try {
      if (docFormat != null) {
        StatusListener sListener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
        if (sListener != null) {
          docFormat.addStatusListener(sListener);
        }
        try {
          // set the flag if true and if the document format support collecting
          docFormat.setShouldCollectRepositioning(collectRepositioningInfo);
          if (docFormat.getShouldCollectRepositioning().booleanValue()) {
            // unpack with collecting of repositioning information
            RepositioningInfo info = new RepositioningInfo();
            String origContent = (String) getFeatures().get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
            RepositioningInfo ampCodingInfo = new RepositioningInfo();
            if (origContent != null) {
              boolean shouldCorrectCR = docFormat instanceof XmlDocumentFormat;
              collectInformationForAmpCodding(origContent, ampCodingInfo, shouldCorrectCR);
              if (docFormat.getMimeType().equals(new MimeType("text", "html"))) {
                collectInformationForWS(origContent, ampCodingInfo);
              }
            }
            docFormat.unpackMarkup(this, info, ampCodingInfo);
            if (origContent != null && docFormat instanceof XmlDocumentFormat) {
              // CRLF correction of RepositioningInfo
              correctRepositioningForCRLFInXML(origContent, info);
            }
            getFeatures().put(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME, info);
          } else {
            // normal old fashioned unpack
            docFormat.unpackMarkup(this);
          }
        } finally {
          // always detach the listener, even when unpacking fails, so it does
          // not stay registered on the format object
          docFormat.removeStatusListener(sListener);
        }
      }
    } catch (DocumentFormatException e) {
      throw new ResourceInstantiationException("Couldn't unpack markup in document " + (sourceUrl != null ? sourceUrl.toExternalForm() : "") + "!", e);
    }
  }
  return this;
}
use of gate.util.DocumentFormatException in project gate-core by GateNLP.
the class EmailDocumentFormat method unpackMarkup.
/**
 * Unpack the markup in the document. This converts markup from the
 * native format (e.g. EMAIL) into annotations in GATE format.
 * The markupElementsMap and element2StringMap of this format are handed to
 * the email handler that creates the annotations; afterwards every "body"
 * annotation in the Original markups set is scanned for paragraphs.
 * It always tries to parse the doc's content. It doesn't matter if the
 * sourceUrl is null or not.
 *
 * @param doc The gate document you want to parse.
 * @throws DocumentFormatException if the document is null, has neither a
 *         source URL nor content, or annotating it fails
 */
@Override
public void unpackMarkup(gate.Document doc) throws DocumentFormatException {
  if (doc == null || (doc.getSourceUrl() == null && doc.getContent() == null)) {
    throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
  }
  setNewLineProperty(doc);

  // the handler does the actual e-mail parsing
  final EmailDocumentHandler handler =
      new gate.email.EmailDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);

  // relay the handler's status messages to this format's own listeners
  final StatusListener relay = new StatusListener() {
    @Override
    public void statusChanged(String text) {
      // this is implemented in DocumentFormat.java and inherited here
      fireStatusChanged(text);
    }
  };
  handler.addStatusListener(relay);

  try {
    // create the annotations on the gate document
    handler.annotateMessages();

    // look for paragraphs inside every body annotation
    AnnotationSet bodies = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get("body");
    if (bodies != null && !bodies.isEmpty()) {
      for (Annotation body : bodies) {
        int bodyStart = body.getStartNode().getOffset().intValue();
        int bodyEnd = body.getEndNode().getOffset().intValue();
        annotateParagraphs(doc, bodyStart, bodyEnd, GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
      }
    }
  } catch (IOException e) {
    throw new DocumentFormatException("Couldn't create a buffered reader ", e);
  } catch (InvalidOffsetException e) {
    throw new DocumentFormatException(e);
  } finally {
    handler.removeStatusListener(relay);
  }
}
use of gate.util.DocumentFormatException in project gate-core by GateNLP.
the class TextualDocumentFormat method annotateParagraphs.
// removeExtraNewLine(Document doc)
/**
 * This method annotates paragraphs in a GATE document. The investigated text
 * spans between start and end offsets and the paragraph annotations are
 * created in the annotSetName. If annotSetName is null then they are created
 * in the default annotation set.
 * <p>
 * Detection is a three-state automaton: state 1 skips whitespace in front of
 * a paragraph, state 2 is inside a paragraph, state 3 has just seen one line
 * break and closes the paragraph if a second one follows immediately. A line
 * break is a {@code '\n'} together with any {@code '\r'} characters directly
 * after it.
 *
 * @param aDoc is the gate document on which the paragraph detection would
 * be performed. If it is null or its content is null then the method would
 * simply return doing nothing.
 * @param startOffset is the index from the document content from which the
 * paragraph detection will start
 * @param endOffset is the offset where the detection will end.
 * @param annotSetName is the name of the set in which paragraph annotation
 * would be created. The annotation type created will be "paragraph"
 * @throws DocumentFormatException if a paragraph annotation cannot be created
 */
public void annotateParagraphs(Document aDoc, int startOffset, int endOffset, String annotSetName) throws DocumentFormatException {
  // Simply return if the document is null or its content
  if (aDoc == null || aDoc.getContent() == null) {
    return;
  }
  // Simply return if the start is > than the end
  if (startOffset > endOffset) {
    return;
  }
  // Decide where to put the newly detected annotations
  AnnotationSet annotSet = null;
  if (annotSetName == null) {
    annotSet = aDoc.getAnnotations();
  } else {
    annotSet = aDoc.getAnnotations(annotSetName);
  }
  // Extract the document content
  String content = aDoc.getContent().toString();
  // This is the offset marking the start of a para
  int startOffsetPara = startOffset;
  // This marks the end of a para
  int endOffsetPara = endOffset;
  // The initial state of the FSA
  int state = 1;
  int index = startOffset;
  while (index < endOffset) {
    // Read the current char
    char ch = content.charAt(index);
    // Whether THIS character is a line break. The flag is recomputed for every
    // character: a line break skipped as leading whitespace in state 1 must
    // not be remembered, otherwise the second character of the following
    // paragraph is mistaken for a line break and a single newline after it
    // would cut the paragraph short.
    boolean readBR = (ch == '\n');
    if (readBR) {
      // swallow any '\r' directly after the '\n' as part of the same break
      while ((index + 1 < endOffset) && (content.charAt(index + 1) == '\r')) {
        index++;
      }
    }
    switch (state) {
      // Initial state: stay here while reading whitespace
      case 1: {
        // A non-whitespace char is the beginning of a paragraph
        if (!Character.isWhitespace(ch)) {
          state = 2;
          startOffsetPara = index;
        }
        break;
      }
      // Inside a paragraph. It can be also a final state.
      case 2: {
        if (readBR) {
          // The possible end of the para is index; state 3 confirms it or
          // not, so the candidate end is recorded here.
          endOffsetPara = index;
          state = 3;
        }
        break;
      }
      // One line break seen: a second one ends the paragraph
      case 3: {
        if (readBR) {
          state = 1;
          // Create an annotation type paragraph
          try {
            annotSet.add(Long.valueOf(startOffsetPara), Long.valueOf(endOffsetPara), "paragraph", Factory.newFeatureMap());
          } catch (gate.util.InvalidOffsetException ioe) {
            throw new DocumentFormatException("Coudn't create a paragraph" + " annotation", ioe);
          }
        } else {
          // Anything else continues the paragraph
          state = 2;
        }
        break;
      }
    }
    // Prepare to read the next char.
    index++;
  }
  // A paragraph still open at the end of the span runs up to where scanning stopped
  endOffsetPara = index;
  if (state == 2 || state == 3) {
    // Create the final annotation using the end of the scanned span
    try {
      annotSet.add(Long.valueOf(startOffsetPara), Long.valueOf(endOffsetPara), "paragraph", Factory.newFeatureMap());
    } catch (gate.util.InvalidOffsetException ioe) {
      throw new DocumentFormatException("Coudn't create a paragraph" + " annotation", ioe);
    }
  }
}
use of gate.util.DocumentFormatException in project gate-core by GateNLP.
the class UimaDocumentFormat method unpackCasMarkup.
/**
 * Convert UIMA CAS markups to GATE markups.
 * <p>
 * The format (XCAS or XMI CAS) is recognised from the presence of a
 * {@code CAS} or {@code xmi:XMI} annotation in the "Original markups" set.
 * The document content is replaced by the concatenated {@code sofaString}
 * features of the Sofa annotations. Every remaining UIMA annotation is then
 * either re-created with its {@code begin}/{@code end} offsets (and copied
 * into a {@code CasViewN} set for every view listing its id) or, when it has
 * no offsets, flattened into document features.
 *
 * @param doc XML document already parsed
 * @throws DocumentFormatException if the document is neither XCAS nor XMI
 *         CAS, or an annotation cannot be created (invalid or non-numeric
 *         offsets)
 */
private void unpackCasMarkup(Document doc) throws DocumentFormatException {
  AnnotationSet inputAS = doc.getAnnotations("Original markups");
  AnnotationSet outputAS = doc.getAnnotations("Original markups");
  // set format specific names
  String casPrefix;
  String idName;
  if (!inputAS.get("CAS").isEmpty()) {
    casPrefix = "uima.cas.";
    idName = "_id";
  } else if (!inputAS.get("xmi:XMI").isEmpty()) {
    casPrefix = "cas:";
    idName = "xmi:id";
  } else {
    throw new DocumentFormatException("The document \"" + doc.getName() + "\" is neither of XCAS nor XMICAS format.");
  }
  // get array/list contained elements annotations
  // compiled once; the prefix is quoted so its dots are matched literally
  final java.util.regex.Pattern collectionType = java.util.regex.Pattern.compile(
      java.util.regex.Pattern.quote(casPrefix) + "[a-zA-Z]+(List|Array)");
  for (Annotation annotation : inputAS) {
    if (collectionType.matcher(annotation.getType()).matches()) {
      try {
        String elements = doc.getContent().getContent(annotation.getStartNode().getOffset(), annotation.getEndNode().getOffset()).toString();
        // add contained values as a feature to the array annotation
        if (!elements.trim().isEmpty()) {
          annotation.getFeatures().put("elements", elements);
        }
      } catch (InvalidOffsetException e) {
        throw new DocumentFormatException(e);
      }
    }
  }
  // get document content from SOFA annotations
  Set<Annotation> sofaSet = inputAS.get(casPrefix + "Sofa");
  if (sofaSet.size() > 1) {
    Out.prln("More than one UIMA SOFA, annotation offsets won't be correct.");
  }
  StringBuilder documentContent = new StringBuilder();
  for (Annotation annotation : sofaSet) {
    String sofaString = (String) annotation.getFeatures().get("sofaString");
    // a Sofa without text must not contribute the literal text "null"
    if (sofaString != null) {
      documentContent.append(sofaString);
    }
  }
  doc.setContent(new DocumentContentImpl(documentContent.toString()));
  // remove SOFA annotations
  inputAS.removeAll(sofaSet);
  // remove non document annotations
  inputAS.removeAll(inputAS.get("CAS"));
  inputAS.removeAll(inputAS.get("xmi:XMI"));
  inputAS.removeAll(inputAS.get("cas:NULL"));
  // get the views members, views will be added later as annotation sets
  List<List<String>> viewList = new ArrayList<List<String>>();
  for (Annotation view : inputAS.get(casPrefix + "View")) {
    String members = (String) view.getFeatures().get("members");
    if (members == null) {
      // keep the slot so the CasView numbering of the other views is unchanged
      viewList.add(new ArrayList<String>());
    } else {
      viewList.add(Arrays.asList(members.split("\\s+")));
    }
  }
  inputAS.removeAll(inputAS.get(casPrefix + "View"));
  // fill a map with the id as key and the entity name as value
  // this is specific to the Temis Luxid CAS format
  Map<String, String> entityMap = new HashMap<String, String>();
  for (Annotation entity : inputAS.get("com.temis.uima.Entity")) {
    FeatureMap features = entity.getFeatures();
    entityMap.put((String) features.get(idName), (String) features.get("value"));
  }
  try {
    // for each UIMA annotation; iterate over a copy because the set is
    // modified inside the loop
    for (Annotation annotation : new HashSet<Annotation>(inputAS)) {
      FeatureMap features = Factory.newFeatureMap();
      features.putAll(annotation.getFeatures());
      String start = (String) features.get("begin");
      String end = (String) features.get("end");
      String id = (String) features.get(idName);
      // UIMA feature
      features.remove("begin");
      // UIMA feature
      features.remove("end");
      // GATE feature
      features.remove("isEmptyAndSpan");
      // UIMA XCAS feature
      features.remove("_indexed");
      if (start == null || end == null) {
        // no offsets so add it as a GATE document feature
        features.remove(idName);
        for (Map.Entry<Object, Object> entry : features.entrySet()) {
          doc.getFeatures().put(annotation.getType() + '_' + id + '.' + entry.getKey(), entry.getValue());
        }
      } else {
        // offsets so add it as a GATE document annotation
        String entityReference = (String) features.get("_ref_entity");
        String type = entityMap.containsKey(entityReference) ? entityMap.get(entityReference) : annotation.getType();
        Integer gateId = outputAS.add(Long.valueOf(start), Long.valueOf(end), type, features);
        int viewCount = 0;
        for (List<String> viewMembers : viewList) {
          if (viewMembers.contains(id)) {
            // add the annotation to the annotation set
            doc.getAnnotations("CasView" + viewCount).add(outputAS.get(gateId));
          }
          viewCount++;
        }
      }
      // delete UIMA annotation
      inputAS.remove(annotation);
    }
  } catch (InvalidOffsetException e) {
    throw new DocumentFormatException("Couldn't create annotation.", e);
  } catch (NumberFormatException e) {
    // begin/end that are not numbers are a format error, not a programming error
    throw new DocumentFormatException("Couldn't create annotation: begin/end offset is not a number.", e);
  }
}
Aggregations