Usage of gate.util.OffsetComparator in the gate-core project by GateNLP:
the method getTokens of the class LuceneDocument.
/**
* This method given a GATE document and other required parameters, for each
* annotation of type indexUnitAnnotationType creates a separate list of
* baseTokens underlying in it.
*/
private List<Token>[] getTokens(gate.Document document, AnnotationSet inputAs, List<String> featuresToInclude, List<String> featuresToExclude, String baseTokenAnnotationType, AnnotationSet baseTokenSet, String indexUnitAnnotationType, AnnotationSet indexUnitSet, Set<String> indexedFeatures) {
boolean excludeFeatures = false;
boolean includeFeatures = false;
// features
if (!featuresToInclude.isEmpty()) {
includeFeatures = true;
} else if (!featuresToExclude.isEmpty()) {
excludeFeatures = true;
}
HashSet<OffsetGroup> unitOffsetsSet = new HashSet<OffsetGroup>();
if (indexUnitAnnotationType == null || indexUnitAnnotationType.trim().length() == 0 || indexUnitSet == null || indexUnitSet.size() == 0) {
// the index Unit Annotation Type is not specified
// therefore we consider the entire document as a single unit
OffsetGroup group = new OffsetGroup();
group.startOffset = 0L;
group.endOffset = document.getContent().size();
unitOffsetsSet.add(group);
} else {
Iterator<Annotation> iter = indexUnitSet.iterator();
while (iter.hasNext()) {
Annotation annotation = iter.next();
OffsetGroup group = new OffsetGroup();
group.startOffset = annotation.getStartNode().getOffset();
group.endOffset = annotation.getEndNode().getOffset();
unitOffsetsSet.add(group);
}
}
Set<String> allTypes = new HashSet<String>();
for (String aType : inputAs.getAllTypes()) {
if (aType.indexOf(".") > -1 || aType.indexOf("=") > -1 || aType.indexOf(";") > -1 || aType.indexOf(",") > -1) {
System.err.println("Annotations of type " + aType + " cannot be indexed as the type name contains one of the ., =, or ; character");
continue;
}
allTypes.add(aType);
}
if (baseTokenSet != null && baseTokenSet.size() > 0) {
allTypes.remove(baseTokenAnnotationType);
}
if (indexUnitSet != null && indexUnitSet.size() > 0)
allTypes.remove(indexUnitAnnotationType);
AnnotationSet toUseSet = new AnnotationSetImpl(document);
for (String type : allTypes) {
for (Annotation a : inputAs.get(type)) {
try {
toUseSet.add(a.getStartNode().getOffset(), a.getEndNode().getOffset(), a.getType(), a.getFeatures());
} catch (InvalidOffsetException ioe) {
throw new GateRuntimeException(ioe);
}
}
}
@SuppressWarnings({ "cast", "unchecked", "rawtypes" }) List<Token>[] toReturn = (List<Token>[]) new List[unitOffsetsSet.size()];
Iterator<OffsetGroup> iter = unitOffsetsSet.iterator();
int counter = 0;
while (iter.hasNext()) {
OffsetGroup group = iter.next();
List<Token> newTokens = new ArrayList<Token>();
List<Annotation> tokens = new ArrayList<Annotation>(toUseSet.getContained(group.startOffset, group.endOffset));
// add tokens from the baseTokenSet
if (baseTokenSet != null && baseTokenSet.size() != 0) {
tokens.addAll(baseTokenSet.getContained(group.startOffset, group.endOffset));
}
if (tokens.isEmpty())
return null;
Collections.sort(tokens, new OffsetComparator());
int position = -1;
for (int i = 0; i < tokens.size(); i++) {
byte inc = 1;
Annotation annot = tokens.get(i);
String type = annot.getType();
// if the feature is specified in featuresToExclude -exclude it
if (excludeFeatures && featuresToExclude.contains(type))
continue;
// exclude it
if (includeFeatures && !featuresToInclude.contains(type))
continue;
int startOffset = annot.getStartNode().getOffset().intValue();
int endOffset = annot.getEndNode().getOffset().intValue();
String text = document.getContent().toString().substring(startOffset, endOffset);
Token token1 = new Token(type, startOffset, endOffset, "*");
// we add extra info of position
if (i > 0) {
if (annot.getStartNode().getOffset().longValue() == tokens.get(i - 1).getStartNode().getOffset().longValue()) {
token1.setPositionIncrement(0);
inc = 0;
}
}
position += inc;
token1.setPosition(position);
newTokens.add(token1);
if (!type.equals(baseTokenAnnotationType) || (annot.getFeatures().get("string") == null)) {
// we need to create one string feature for this
Token tk1 = new Token(text, startOffset, endOffset, type + ".string");
indexedFeatures.add(type + ".string");
tk1.setPositionIncrement(0);
tk1.setPosition(position);
newTokens.add(tk1);
}
// now find out the features and add them
FeatureMap features = annot.getFeatures();
Iterator<Object> fIter = features.keySet().iterator();
while (fIter.hasNext()) {
String type1 = fIter.next().toString();
// it
if (excludeFeatures && featuresToExclude.contains(type + "." + type1)) {
continue;
}
// exclude it
if (includeFeatures && !featuresToInclude.contains(type + "." + type1))
continue;
Object tempText = features.get(type1);
if (tempText == null)
continue;
String text1 = tempText.toString();
// we need to qualify the type names
// for each annotation type feature we add AT.Feature=="**" to be able
// to search for it
// to calculate stats
Token tempToken = new Token(text1, startOffset, endOffset, type + "." + type1);
indexedFeatures.add(type + "." + type1);
tempToken.setPositionIncrement(0);
tempToken.setPosition(position);
newTokens.add(tempToken);
Token onlyATFeature = new Token(type + "." + type1, startOffset, endOffset, "**");
onlyATFeature.setPosition(position);
onlyATFeature.setPositionIncrement(0);
newTokens.add(onlyATFeature);
}
}
toReturn[counter] = newTokens;
counter++;
}
return toReturn;
}
Aggregations