Example usage of org.apache.stanbol.enhancer.engines.entitylinking.impl.ChunkData in the Apache Stanbol project:
the reduce method of the LinkableTokenFilter class.
/**
 * Reduces the cluster of {@code TagLL} tags headed by {@code head[0]} in two phases:
 * <ol>
 *   <li>remove tags that do not overlap any linkable token, or that match too
 *       little of their processable chunk;</li>
 *   <li>among the remaining tags, greedily keep the longest non-overlapping
 *       ones — similar to TagClusterReducer, but comparing the "matchable
 *       span" (the span covering matchable tokens) instead of the full text
 *       span of each tag.</li>
 * </ol>
 * Side effects: removes tags from the linked list via {@code TagLL#removeLL()}
 * and destructively consumes the instance field {@code linkableTokens}
 * (elements are removed from the front as tags advance past them).
 * NOTE(review): presumably tags in the cluster are ordered by start offset —
 * the early {@code break} in phase (2) depends on that; confirm with the caller.
 *
 * @param head array whose first element is the head of the tag cluster to reduce
 */
@Override
public void reduce(TagLL[] head) {
//this implements a two phase reduce
//(1) reduce Tags with no linkable tokens and not matching enough of the
// current chunk.
//(2) reduce remaining Tags in the cluster similar to TagClusterReducer
// but only considering the "matchable span" of the Tags. Meaning the
// span over matchable Tokens and not the full Text.
//this map holds the matchable spans for Tags. Filled during phase (1) and
//used for phase(2)
Map<TagLL, int[]> matchableTagSpan = new HashMap<TagLL, int[]>();
//(1) reduce Tags based on link-/matchable tokens as well as chunks.
LinkableTokenContext linkableTokenContext;
for (TagLL tag = head[0]; tag != null; tag = tag.getNextTag()) {
int start = tag.getStartOffset();
int end = tag.getEndOffset();
//advance past (and discard) linkable tokens that end before this tag starts;
//this mutates the shared linkableTokens list
linkableTokenContext = linkableTokens.isEmpty() ? null : linkableTokens.get(0);
while (linkableTokenContext != null && linkableTokenContext.linkableToken.token.getEnd() <= start) {
linkableTokens.remove(0);
linkableTokenContext = linkableTokens.isEmpty() ? null : linkableTokens.get(0);
}
if (linkableTokenContext == null || linkableTokenContext.linkableToken.token.getStart() >= end) {
//does not overlap any linkable token
//remove the tag from the cluster
tag.removeLL();
if (log.isTraceEnabled()) {
CharSequence tagSequence = at.getText().subSequence(start, end);
log.trace(" > reduce tag {} - no overlapp with linkable token", tagSequence);
}
} else {
//if the tag overlaps a linkable token
TokenData linkableToken = linkableTokenContext.linkableToken;
List<TokenData> tokens = linkableTokenContext.context;
//calculate the matchable start/end span of the current TagLL:
//start from the intersection of the tag span and the linkable token span ...
int[] mSpan = new int[] { Math.max(start, linkableToken.token.getStart()), Math.min(end, linkableToken.token.getEnd()) };
//... then extend the span start leftwards over matchable tokens that still
//begin within the tag
if (mSpan[0] > start) {
for (int i = linkableToken.index - 1; i >= 0; i--) {
TokenData token = tokens.get(i);
int tStart = token.token.getStart();
if (tStart < start) {
break;
} else if (token.isMatchable) {
mSpan[0] = tStart;
}
}
}
//... and extend the span end rightwards over matchable tokens that still
//end within the tag
if (mSpan[1] < end) {
for (int i = linkableToken.index + 1; i < tokens.size(); i++) {
TokenData token = tokens.get(i);
int tEnd = token.token.getEnd();
if (tEnd > end) {
break;
} else if (token.isMatchable) {
mSpan[1] = tEnd;
}
}
}
if (log.isTraceEnabled()) {
CharSequence text = at.getText();
log.trace(" - matchable Span {}{} for Tag {}[{},{}]", new Object[] { text.subSequence(mSpan[0], mSpan[1]), Arrays.toString(mSpan), text.subSequence(start, end), start, end });
}
matchableTagSpan.put(tag, mSpan);
//check if it matches > 50% of the chunk
ChunkData cd = linkableToken.inChunk;
//only apply the chunk criterion when chunks are not ignored and the
//linkable token lies within a processable chunk
if (!lpc.isIgnoreChunks() && cd != null && cd.isProcessable) {
//a negative matchable start char means "unknown" → fall back to the tag start
int cstart = cd.getMatchableStartChar() >= 0 ? cd.getMatchableStartChar() : start;
int cend = cd.getMatchableEndChar();
if (cstart < start || cend > end) {
//if the tag does not cover the whole chunk: count how many of the
//chunk's matchable tokens (num) are covered by the tag (match)
int num = 0;
int match = 0;
for (int i = cd.getMatchableStart(); i <= cd.getMatchableEnd(); i++) {
TokenData td = tokens.get(i);
if (td.isMatchable) {
num++;
//NOTE(review): by Java precedence this reads
//(match < 1 && tokenStart >= start) || (match > 0 && tokenEnd <= end)
//i.e. the first counted token only needs to START inside the tag and
//later ones only need to END inside it — confirm this asymmetry is
//intended and not a missing pair of parentheses
if (match < 1 && td.token.getStart() >= start || match > 0 && td.token.getEnd() <= end) {
match++;
}
}
}
//tokens in the Chunk are matched!
//drop the tag only when BOTH the match ratio is below minChunkMatchScore
//AND fewer than minFoundTokens tokens matched
if (((float) match / (float) num) < minChunkMatchScore && match < minFoundTokens) {
//ignore
tag.removeLL();
matchableTagSpan.remove(tag);
if (log.isTraceEnabled()) {
CharSequence text = at.getText();
log.trace(" - reduce tag {}[{},{}] - does only match " + "{} of {} of matchable Chunk {}[{},{}]", new Object[] { text.subSequence(start, end), start, end, match, num, text.subSequence(cstart, cend), cstart, cend });
}
} else if (log.isTraceEnabled()) {
CharSequence text = at.getText();
log.trace(" + keep tag {}[{},{}] - matches {} of {} " + "matchable Tokens for matchable Chunk {}[{},{}]", new Object[] { text.subSequence(start, end), start, end, match, num, text.subSequence(cstart, cend), cstart, cend });
}
} else {
//the tag covers the whole matchable part of the chunk → keep it
if (log.isTraceEnabled()) {
CharSequence text = at.getText();
log.trace(" + keep tag {}[{},{}] - matches whole Chunk {}[{},{}]", new Object[] { text.subSequence(start, end), start, end, text.subSequence(cstart, cend), cstart, cend });
}
}
} else if (log.isTraceEnabled()) {
CharSequence tagSequence = at.getText().subSequence(start, end);
log.trace(" + keep tag {} - not in processable chunk", tagSequence);
}
}
}
// (2) reduce the remaining tags based on their matchable spans
if (head.length == 0 || head[0] == null || head[0].getNextTag() == null) {
//no tag left from phase one or single token optimization
return;
}
//can not use TagLL#mark, so track kept tags in a separate set
Set<TagLL> marked = new HashSet<TagLL>();
while (true) {
// --Find longest not already marked (mSpan length; ties go to the later tag
//   because of the >= comparison)
TagLL longest = null;
int longestMCharLen = -1;
int[] longestMSpan = null;
for (TagLL t = head[0]; t != null; t = t.getNextTag()) {
int[] mSpan = matchableTagSpan.get(t);
int mCharLen = mSpan[1] - mSpan[0];
if (!marked.contains(t) && (longest == null || mCharLen >= longestMCharLen)) {
longest = t;
longestMSpan = mSpan;
longestMCharLen = mCharLen;
}
}
if (longest == null)
break;
// --Mark longest (so we return it eventually)
marked.add(longest);
// --Remove tags overlapping this longest
for (TagLL t = head[0]; t != null; t = t.getNextTag()) {
if (marked.contains(t)) {
continue;
}
int[] mSpan = matchableTagSpan.get(t);
//NOTE(review): TagClusterReducer's analogous overlap test compares the
//earlier-starting span's end against the other span's START; here the
//first branch compares against longestMSpan[1] (the END) — confirm this
//is intended and not a typo for longestMSpan[0]
boolean overlaps = mSpan[0] < longestMSpan[0] ? mSpan[1] > longestMSpan[1] : mSpan[0] < longestMSpan[1];
if (overlaps) {
t.removeLL();
} else if (mSpan[0] >= longestMSpan[1]) {
// no subsequent can possibly overlap (relies on tags being ordered by start)
break;
}
}
}
// end of greedy longest-first selection loop
}