Use of gate.util.InvalidOffsetException in project gate-core by GateNLP.
The class Annotandum, method unpackMarkup.
@Override
public void unpackMarkup(gate.Document doc) throws DocumentFormatException {
  if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
    throw new DocumentFormatException(
        "GATE document is null or no content found. Nothing to parse!");
  }
  setNewLineProperty(doc);
  String[] lines = doc.getContent().toString().split("[\\n\\r]+");
  StringBuilder newContent = new StringBuilder();
  // Items of data to be turned into Original markups annotations
  List<Annotandum> annotanda = new ArrayList<Annotandum>();
  // Currently open tags: created by "B-FOO", extended by "I-FOO", closed
  // by "O" or end of sentence.
  Map<String, Annotandum> inProgress = new HashMap<String, Annotandum>();
  /* Note: I-Foo handling currently has a weak spot.
   *
   * this    B-Foo
   * is      B-Bar
   * strange I-Foo
   *
   * will result in a Foo annotation spanning "this is strange", because
   * the I-Foo extends the existing B-Foo. If the sentence is cut off
   * before hitting another I-Foo, however, the Foo annotation will not
   * have been extended. But this situation will not occur in carefully
   * edited input.
   */
  long oldEnd = 0L;
  long start = 0L;
  long end = 0L;
  for (String line : lines) {
    oldEnd = end;
    start = newContent.length();
    String[] items = line.split("\\s+");
    if (items.length == 0) {
      // blank line: close any annotations in progress
      newContent.append("\n");
      end = newContent.length();
      finishAllTags(inProgress, annotanda, oldEnd);
    } else {
      String token = items[0];
      // We've agreed to put the space after every token.
      newContent.append(token);
      end = newContent.length();
      newContent.append(' ');
      // Create Token and following SpaceToken annotation.
      annotanda.add(Annotandum.makeToken(start, end, token));
      annotanda.add(Annotandum.makeSpaceToken(end));
      for (int column = 1; column < items.length; column++) {
        if (items[column].equals("O")) {
          // "O" means close all annotations in progress
          finishAllTags(inProgress, annotanda, oldEnd);
        } else if ((items[column].length() > 2) && items[column].startsWith("U-")) {
          // "U-FOO": single-token annotation, after closing any "FOO"
          // already in progress
          String type = items[column].substring(2);
          finishTag(type, inProgress, annotanda, oldEnd);
          annotanda.add(new Annotandum(type, start, end, column, true));
        } else if ((items[column].length() > 2) && items[column].startsWith("L-")) {
          // "L-FOO": last token, so close the "FOO" already in progress
          String type = items[column].substring(2);
          if (inProgress.containsKey(type)) {
            // good L-FOO, so update the end offset
            inProgress.get(type).endOffset = end;
          } else {
            // bad data, containing L-FOO without a B-FOO, so treat as if B-FOO
            inProgress.put(type, new Annotandum(type, start, end, column, true));
          }
          finishTag(type, inProgress, annotanda, end);
        } else if ((items[column].length() > 2) && items[column].startsWith("B-")) {
          // "B-FOO": open a new annotation, after closing any "FOO"
          // already in progress
          String type = items[column].substring(2);
          finishTag(type, inProgress, annotanda, oldEnd);
          inProgress.put(type, new Annotandum(type, start, end, column, true));
        } else if ((items[column].length() > 2) && items[column].startsWith("I-")) {
          // "I-FOO": extend the current "FOO" annotation
          String type = items[column].substring(2);
          if (inProgress.containsKey(type)) {
            // good I-FOO, so update the end offset
            inProgress.get(type).endOffset = end;
          } else {
            // bad data, containing I-FOO without a B-FOO, so treat as if B-FOO
            inProgress.put(type, new Annotandum(type, start, end, column, true));
          }
        } else {
          // "FOO": treat as single-token annotation (such as a POS tag)
          Annotandum tag = new Annotandum(items[column], start, end, column, false);
          annotanda.add(tag);
        }
      }
    }
  }
  // end of input: close any remaining annotations
  finishAllTags(inProgress, annotanda, end);
  // set the new content & create the Original markups annotations
  try {
    DocumentContent newContentImpl = new DocumentContentImpl(newContent.toString());
    doc.edit(0L, doc.getContent().size(), newContentImpl);
    long newSize = doc.getContent().size();
    AnnotationSet originalMarkups =
        doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
    for (Annotandum ann : annotanda) {
      if (DEBUG) {
        String string = Utils.stringFor(doc, ann.startOffset,
            (ann.endOffset <= newSize) ? ann.endOffset : newSize);
        System.out.format("%d %d %s %s\n", ann.startOffset, ann.endOffset,
            ann.type, string);
      }
      originalMarkups.add(ann.startOffset, ann.endOffset, ann.type, ann.features);
    }
  } catch (InvalidOffsetException e) {
    throw new DocumentFormatException(e);
  }
}
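
For orientation, here is a sketch of what the finishTag helper used above might look like. This is an assumption inferred from the call sites, not the actual gate-core code; finishAllTags would then simply call it for every open type.

// Hypothetical sketch: close one in-progress annotation of the given
// type, record its final end offset, and queue it for creation.
private static void finishTag(String type, Map<String, Annotandum> inProgress,
    List<Annotandum> annotanda, long end) {
  Annotandum tag = inProgress.remove(type);
  if (tag != null) {
    tag.endOffset = end;
    annotanda.add(tag);
  }
}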
Use of gate.util.InvalidOffsetException in project gate-core by GateNLP.
The class AnnotationSetImpl, method getNodes.
/**
 * Returns the nodes corresponding to the Longs. The nodes are created
 * if they don't already exist.
 */
private final Node[] getNodes(Long start, Long end) throws InvalidOffsetException {
  // are the offsets valid?
  if (!doc.isValidOffsetRange(start, end)) {
    throw new InvalidOffsetException("Offsets [" + start + ":" + end
        + "] not valid for this document of size " + doc.getContent().size());
  }
  // to find out if nodes need creating or already exist, we need the
  // index by start offset
  if (nodesByOffset == null) {
    indexByStartOffset();
  }
  // find existing nodes; if appropriate nodes don't already exist,
  // create them
  Node startNode = nodesByOffset.get(start);
  if (startNode == null)
    startNode = new NodeImpl(doc.getNextNodeId(), start);
  Node endNode = null;
  if (start.equals(end)) {
    endNode = startNode;
    return new Node[] { startNode, endNode };
  }
  endNode = nodesByOffset.get(end);
  if (endNode == null)
    endNode = new NodeImpl(doc.getNextNodeId(), end);
  return new Node[] { startNode, endNode };
}
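
This validation is what callers of AnnotationSet.add see surface as InvalidOffsetException. A hedged usage sketch, with a hypothetical helper name, of the typical caller-side pattern:

// Hedged usage sketch: add validates offsets against the document and
// throws InvalidOffsetException when they are out of range.
void annotateWholeDocument(gate.Document doc) {
  AnnotationSet as = doc.getAnnotations();
  try {
    as.add(0L, doc.getContent().size(), "Document", gate.Factory.newFeatureMap());
  } catch (InvalidOffsetException e) {
    // reached only if the offsets are null, reversed, or beyond the content
    throw new GateRuntimeException(e);
  }
}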
Use of gate.util.InvalidOffsetException in project gate-core by GateNLP.
The class NekoHtmlDocumentHandler, method endDocument.
/**
 * Called when the parser reaches the end of the document. Here we
 * store the new content and construct the Original markups
 * annotations.
 */
@Override
public void endDocument(Augmentations augs) throws XNIException {
  if (DEBUG_GENERAL) {
    Out.println("endDocument");
  }
  CustomObject obj = null;
  // replace the old content with the new one
  doc.setContent(new DocumentContentImpl(tmpDocContent.toString()));
  // get the Original markups annotation set from this gate document
  if (basicAS == null)
    basicAS = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
  // sort colector ascending by id
  Collections.sort(colector);
  // iterate through colector and construct annotations
  while (!colector.isEmpty()) {
    obj = colector.getFirst();
    colector.remove(obj);
    // construct an annotation from this obj
    try {
      basicAS.add(obj.getStart(), obj.getEnd(), obj.getElemName(), obj.getFM());
    } catch (InvalidOffsetException e) {
      Err.prln("Error creating an annot :" + obj + " Discarded...");
    } // end try
  } // end while
  // notify the listener about the total number of elements that
  // have been processed
  fireStatusChangedEvent("Total elements : " + elements);
}
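
The colector list's element class is not shown here, but the calls above imply it exposes offsets, an element name, and a feature map, and sorts by a creation id. A minimal sketch under those assumptions; the real NekoHtmlDocumentHandler inner class may differ in detail:

// Hypothetical sketch of the colector element type, inferred from the
// getters used above: a holder for one pending Original markups annotation.
class CustomObject implements Comparable<CustomObject> {
  private final long id;
  private final String elemName;
  private final FeatureMap fm;
  private final Long start, end;

  CustomObject(long id, String elemName, FeatureMap fm, Long start, Long end) {
    this.id = id;
    this.elemName = elemName;
    this.fm = fm;
    this.start = start;
    this.end = end;
  }

  Long getStart() { return start; }
  Long getEnd() { return end; }
  String getElemName() { return elemName; }
  FeatureMap getFM() { return fm; }

  @Override
  public int compareTo(CustomObject o) {
    return Long.compare(id, o.id);
  }
}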
Use of gate.util.InvalidOffsetException in project gate-core by GateNLP.
The class LuceneDocument, method createTokens.
private boolean createTokens(gate.Document gateDocument, AnnotationSet set) {
  String gateContent = gateDocument.getContent().toString();
  int start = -1;
  for (int i = 0; i < gateContent.length(); i++) {
    char c = gateContent.charAt(i);
    if (Character.isWhitespace(c)) {
      if (start != -1) {
        FeatureMap features = gate.Factory.newFeatureMap();
        String string = gateContent.substring(start, i);
        if (string.trim().length() > 0) {
          features.put("string", string);
          try {
            set.add(Long.valueOf(start), Long.valueOf(i),
                Constants.ANNIC_TOKEN, features);
          } catch (InvalidOffsetException ioe) {
            ioe.printStackTrace();
            return false;
          }
        }
        start = i + 1;
      }
    } else {
      if (start == -1)
        start = i;
    }
  }
  if (start == -1)
    return false;
  if (start < gateContent.length()) {
    FeatureMap features = gate.Factory.newFeatureMap();
    String string = gateContent.substring(start, gateContent.length());
    if (string.trim().length() > 0) {
      features.put("string", string);
      try {
        set.add(Long.valueOf(start), Long.valueOf(gateContent.length()),
            Constants.ANNIC_TOKEN, features);
      } catch (InvalidOffsetException ioe) {
        ioe.printStackTrace();
        return false;
      }
    }
  }
  return true;
}
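
A hedged usage sketch: createTokens is private to LuceneDocument, so this would run inside that class, and the annotation set name here is hypothetical.

// Usage sketch: whitespace-tokenise the content into ANNIC_TOKEN
// annotations; false signals empty content or a rejected offset.
AnnotationSet tokenSet = gateDocument.getAnnotations("Tokenised");
if (!createTokens(gateDocument, tokenSet)) {
  System.err.println("no tokens created for " + gateDocument.getName());
}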
Use of gate.util.InvalidOffsetException in project gate-core by GateNLP.
The class LuceneDocument, method getTokens.
/**
 * Given a GATE document and other required parameters, this method
 * creates, for each annotation of type indexUnitAnnotationType, a
 * separate list of the baseTokens underlying it.
 */
private List<Token>[] getTokens(gate.Document document, AnnotationSet inputAs,
    List<String> featuresToInclude, List<String> featuresToExclude,
    String baseTokenAnnotationType, AnnotationSet baseTokenSet,
    String indexUnitAnnotationType, AnnotationSet indexUnitSet,
    Set<String> indexedFeatures) {
  boolean excludeFeatures = false;
  boolean includeFeatures = false;
  // which features should be indexed?
  if (!featuresToInclude.isEmpty()) {
    includeFeatures = true;
  } else if (!featuresToExclude.isEmpty()) {
    excludeFeatures = true;
  }
  HashSet<OffsetGroup> unitOffsetsSet = new HashSet<OffsetGroup>();
  if (indexUnitAnnotationType == null
      || indexUnitAnnotationType.trim().length() == 0
      || indexUnitSet == null || indexUnitSet.size() == 0) {
    // the index unit annotation type is not specified,
    // therefore we consider the entire document as a single unit
    OffsetGroup group = new OffsetGroup();
    group.startOffset = 0L;
    group.endOffset = document.getContent().size();
    unitOffsetsSet.add(group);
  } else {
    Iterator<Annotation> iter = indexUnitSet.iterator();
    while (iter.hasNext()) {
      Annotation annotation = iter.next();
      OffsetGroup group = new OffsetGroup();
      group.startOffset = annotation.getStartNode().getOffset();
      group.endOffset = annotation.getEndNode().getOffset();
      unitOffsetsSet.add(group);
    }
  }
  Set<String> allTypes = new HashSet<String>();
  for (String aType : inputAs.getAllTypes()) {
    if (aType.indexOf(".") > -1 || aType.indexOf("=") > -1
        || aType.indexOf(";") > -1 || aType.indexOf(",") > -1) {
      System.err.println("Annotations of type " + aType
          + " cannot be indexed as the type name contains one of the"
          + " '.', '=', ';' or ',' characters");
      continue;
    }
    allTypes.add(aType);
  }
  if (baseTokenSet != null && baseTokenSet.size() > 0) {
    allTypes.remove(baseTokenAnnotationType);
  }
  if (indexUnitSet != null && indexUnitSet.size() > 0)
    allTypes.remove(indexUnitAnnotationType);
  AnnotationSet toUseSet = new AnnotationSetImpl(document);
  for (String type : allTypes) {
    for (Annotation a : inputAs.get(type)) {
      try {
        toUseSet.add(a.getStartNode().getOffset(), a.getEndNode().getOffset(),
            a.getType(), a.getFeatures());
      } catch (InvalidOffsetException ioe) {
        throw new GateRuntimeException(ioe);
      }
    }
  }
  @SuppressWarnings({ "cast", "unchecked", "rawtypes" })
  List<Token>[] toReturn = (List<Token>[]) new List[unitOffsetsSet.size()];
  Iterator<OffsetGroup> iter = unitOffsetsSet.iterator();
  int counter = 0;
  while (iter.hasNext()) {
    OffsetGroup group = iter.next();
    List<Token> newTokens = new ArrayList<Token>();
    List<Annotation> tokens = new ArrayList<Annotation>(
        toUseSet.getContained(group.startOffset, group.endOffset));
    // add tokens from the baseTokenSet
    if (baseTokenSet != null && baseTokenSet.size() != 0) {
      tokens.addAll(baseTokenSet.getContained(group.startOffset, group.endOffset));
    }
    if (tokens.isEmpty())
      return null;
    Collections.sort(tokens, new OffsetComparator());
    int position = -1;
    for (int i = 0; i < tokens.size(); i++) {
      byte inc = 1;
      Annotation annot = tokens.get(i);
      String type = annot.getType();
      // if the type is specified in featuresToExclude, skip it
      if (excludeFeatures && featuresToExclude.contains(type))
        continue;
      // if types are restricted and this one is not included, skip it
      if (includeFeatures && !featuresToInclude.contains(type))
        continue;
      int startOffset = annot.getStartNode().getOffset().intValue();
      int endOffset = annot.getEndNode().getOffset().intValue();
      String text = document.getContent().toString().substring(startOffset, endOffset);
      Token token1 = new Token(type, startOffset, endOffset, "*");
      // we add extra info about position: a token starting at the same
      // offset as the previous one gets a position increment of 0
      if (i > 0) {
        if (annot.getStartNode().getOffset().longValue() ==
            tokens.get(i - 1).getStartNode().getOffset().longValue()) {
          token1.setPositionIncrement(0);
          inc = 0;
        }
      }
      position += inc;
      token1.setPosition(position);
      newTokens.add(token1);
      if (!type.equals(baseTokenAnnotationType)
          || (annot.getFeatures().get("string") == null)) {
        // we need to create one string feature for this
        Token tk1 = new Token(text, startOffset, endOffset, type + ".string");
        indexedFeatures.add(type + ".string");
        tk1.setPositionIncrement(0);
        tk1.setPosition(position);
        newTokens.add(tk1);
      }
      // now find out the features and add them
      FeatureMap features = annot.getFeatures();
      Iterator<Object> fIter = features.keySet().iterator();
      while (fIter.hasNext()) {
        String type1 = fIter.next().toString();
        // if the feature is specified in featuresToExclude, skip it
        if (excludeFeatures && featuresToExclude.contains(type + "." + type1)) {
          continue;
        }
        // if features are restricted and this one is not included, skip it
        if (includeFeatures && !featuresToInclude.contains(type + "." + type1))
          continue;
        Object tempText = features.get(type1);
        if (tempText == null)
          continue;
        String text1 = tempText.toString();
        // we need to qualify the type names: for each annotation type
        // feature we also add AT.Feature == "**" to be able to search
        // for it and to calculate stats
        Token tempToken = new Token(text1, startOffset, endOffset,
            type + "." + type1);
        indexedFeatures.add(type + "." + type1);
        tempToken.setPositionIncrement(0);
        tempToken.setPosition(position);
        newTokens.add(tempToken);
        Token onlyATFeature = new Token(type + "." + type1, startOffset,
            endOffset, "**");
        onlyATFeature.setPosition(position);
        onlyATFeature.setPositionIncrement(0);
        newTokens.add(onlyATFeature);
      }
    }
    toReturn[counter] = newTokens;
    counter++;
  }
  return toReturn;
}
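
One assumption worth spelling out: OffsetGroup is used above only through direct field access, so a minimal sketch of it would be:

// Hypothetical sketch of OffsetGroup, as suggested by the field access
// above: the start and end offsets of one index unit.
class OffsetGroup {
  Long startOffset;
  Long endOffset;
}

The position-increment trick is Lucene's: every extra Token emitted for the same annotation (its ".string" text, its feature values, the AT.Feature == "**" marker) is added with a position increment of 0, so it occupies the same position as the annotation-type token and a query can match any of them at that slot.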