Usage example of org.apache.stanbol.enhancer.engines.entitylinking.impl.TokenData in the Apache Stanbol project:
class LinkableTokenFilter, method reduce().
/**
 * Reduces a cluster of overlapping tags (linked list starting at {@code head[0]})
 * in two phases:
 * <ol>
 *   <li>remove tags that do not overlap any linkable token, or that match too
 *       little of a processable chunk;</li>
 *   <li>of the remaining tags keep only non-overlapping ones, preferring tags
 *       with the longest span over <i>matchable</i> tokens (similar to
 *       TagClusterReducer, but measured on the matchable span rather than the
 *       full text span).</li>
 * </ol>
 * Side effects: consumes entries from the head of {@link #linkableTokens} that
 * end before the processed tags, and unlinks reduced tags via {@code removeLL()}.
 * @param head the cluster head array; {@code head[0]} is the first tag
 */
@Override
public void reduce(TagLL[] head) {
//this implements a two phase reduce
//(1) reduce Tags with no linkable tokens and not matching enough of the
// current chunk.
//(2) reduce remaining Tags in the cluster similar to TagClusterReducer
// but only considering the "matchable span" of the Tags. Meaning the
// span over matchable Tokens and not the full Text.
//this map holds the matchable spans for Tags. Filled during phase (1) and
//used for phase(2)
Map<TagLL, int[]> matchableTagSpan = new HashMap<TagLL, int[]>();
//(1) reduce Tags based on link-/matchable tokens as well as chunks.
LinkableTokenContext linkableTokenContext;
for (TagLL tag = head[0]; tag != null; tag = tag.getNextTag()) {
int start = tag.getStartOffset();
int end = tag.getEndOffset();
linkableTokenContext = linkableTokens.isEmpty() ? null : linkableTokens.get(0);
//discard linkable tokens that end before this tag starts; tags are
//processed in text order, so those tokens can never match a later tag
while (linkableTokenContext != null && linkableTokenContext.linkableToken.token.getEnd() <= start) {
linkableTokens.remove(0);
linkableTokenContext = linkableTokens.isEmpty() ? null : linkableTokens.get(0);
}
if (linkableTokenContext == null || linkableTokenContext.linkableToken.token.getStart() >= end) {
//does not overlap any linkable token
//remove the tag from the cluster
tag.removeLL();
if (log.isTraceEnabled()) {
CharSequence tagSequence = at.getText().subSequence(start, end);
log.trace(" > reduce tag {} - no overlapp with linkable token", tagSequence);
}
} else {
//if the tag overlaps a linkable token
TokenData linkableToken = linkableTokenContext.linkableToken;
List<TokenData> tokens = linkableTokenContext.context;
//calculate the matchable start/end span of the current TagLL:
//start with the overlap of the tag and the linkable token ...
int[] mSpan = new int[] { Math.max(start, linkableToken.token.getStart()), Math.min(end, linkableToken.token.getEnd()) };
//... then widen it over matchable tokens before the linkable token
//that still lie within the tag
if (mSpan[0] > start) {
for (int i = linkableToken.index - 1; i >= 0; i--) {
TokenData token = tokens.get(i);
int tStart = token.token.getStart();
if (tStart < start) {
break;
} else if (token.isMatchable) {
mSpan[0] = tStart;
}
}
}
//... and over matchable tokens after the linkable token within the tag
if (mSpan[1] < end) {
for (int i = linkableToken.index + 1; i < tokens.size(); i++) {
TokenData token = tokens.get(i);
int tEnd = token.token.getEnd();
if (tEnd > end) {
break;
} else if (token.isMatchable) {
mSpan[1] = tEnd;
}
}
}
if (log.isTraceEnabled()) {
CharSequence text = at.getText();
log.trace(" - matchable Span {}{} for Tag {}[{},{}]", new Object[] { text.subSequence(mSpan[0], mSpan[1]), Arrays.toString(mSpan), text.subSequence(start, end), start, end });
}
matchableTagSpan.put(tag, mSpan);
//check if it matches > 50% of the chunk
ChunkData cd = linkableToken.inChunk;
if (!lpc.isIgnoreChunks() && cd != null && cd.isProcessable) {
int cstart = cd.getMatchableStartChar() >= 0 ? cd.getMatchableStartChar() : start;
int cend = cd.getMatchableEndChar();
if (cstart < start || cend > end) {
//if the tag does not cover the whole chunk
//count matchable tokens in the chunk (num) and how many of
//those fall inside the tag (match)
int num = 0;
int match = 0;
for (int i = cd.getMatchableStart(); i <= cd.getMatchableEnd(); i++) {
TokenData td = tokens.get(i);
if (td.isMatchable) {
num++;
//NOTE(review): relies on && binding tighter than || - i.e.
//(match < 1 && start-check) || (match > 0 && end-check).
//Confirm this grouping is intended.
if (match < 1 && td.token.getStart() >= start || match > 0 && td.token.getEnd() <= end) {
match++;
}
}
}
//tokens in the Chunk are matched!
//reduce the tag only when BOTH the ratio is below the configured
//minimum AND fewer than minFoundTokens tokens matched
//(num is float-divided; if num were 0 this yields NaN, which
//compares false - presumably num >= 1 here; verify)
if (((float) match / (float) num) < minChunkMatchScore && match < minFoundTokens) {
//ignore
tag.removeLL();
matchableTagSpan.remove(tag);
if (log.isTraceEnabled()) {
CharSequence text = at.getText();
log.trace(" - reduce tag {}[{},{}] - does only match " + "{} of {} of matchable Chunk {}[{},{}]", new Object[] { text.subSequence(start, end), start, end, match, num, text.subSequence(cstart, cend), cstart, cend });
}
} else if (log.isTraceEnabled()) {
CharSequence text = at.getText();
log.trace(" + keep tag {}[{},{}] - matches {} of {} " + "matchable Tokens for matchable Chunk {}[{},{}]", new Object[] { text.subSequence(start, end), start, end, match, num, text.subSequence(cstart, cend), cstart, cend });
}
} else {
if (log.isTraceEnabled()) {
CharSequence text = at.getText();
log.trace(" + keep tag {}[{},{}] - matches whole Chunk {}[{},{}]", new Object[] { text.subSequence(start, end), start, end, text.subSequence(cstart, cend), cstart, cend });
}
}
} else if (log.isTraceEnabled()) {
CharSequence tagSequence = at.getText().subSequence(start, end);
log.trace(" + keep tag {} - not in processable chunk", tagSequence);
}
}
}
//(2) reduce overlapping tags based on their matchable spans
if (head.length == 0 || head[0] == null || head[0].getNextTag() == null) {
//no tag left from phase one or single token optimization
return;
}
//can not use TagLL#mark - track selected tags in a separate set instead
Set<TagLL> marked = new HashSet<TagLL>();
while (true) {
// --Find longest not already marked
TagLL longest = null;
int longestMCharLen = -1;
int[] longestMSpan = null;
for (TagLL t = head[0]; t != null; t = t.getNextTag()) {
//every tag surviving phase (1) has an entry in matchableTagSpan
int[] mSpan = matchableTagSpan.get(t);
int mCharLen = mSpan[1] - mSpan[0];
//'>=' prefers the later tag on equal matchable length
if (!marked.contains(t) && (longest == null || mCharLen >= longestMCharLen)) {
longest = t;
longestMSpan = mSpan;
longestMCharLen = mCharLen;
}
}
if (longest == null)
break;
// --Mark longest (so we return it eventually)
marked.add(longest);
// --Remove tags overlapping this longest
for (TagLL t = head[0]; t != null; t = t.getNextTag()) {
if (marked.contains(t)) {
continue;
}
int[] mSpan = matchableTagSpan.get(t);
//overlap test on the matchable spans (tags are ordered by start)
boolean overlaps = mSpan[0] < longestMSpan[0] ? mSpan[1] > longestMSpan[1] : mSpan[0] < longestMSpan[1];
if (overlaps) {
t.removeLL();
} else if (mSpan[0] >= longestMSpan[1]) {
// no subsequent can possibly overlap
break;
}
}
}
// end of phase (2) selection loop
}
Usage example of org.apache.stanbol.enhancer.engines.entitylinking.impl.TokenData in the Apache Stanbol project:
class LinkableTokenFilter, method nextToken().
/**
 * Advances the two-level iteration over {@link TokenData} (sections, then
 * tokens) and returns the next token overlapping the current {@link #offset}.
 * <p>
 * <b>NOTE</b>: this method updates several fields that hold the iteration
 * state ({@link #tokensCursor}, the {@link #tokens} buffer). A
 * <code>null</code> return means there is no further token for the current
 * {@link #offset} (or the end of the {@link AnalysedText} was reached).
 * @param first <code>true</code> on the first call for the current
 * {@link #offset} state
 * @return the next token, or <code>null</code> if none remains for the
 * current {@link #offset}
 */
private TokenData nextToken(boolean first) {
int startOffset = offset.startOffset();
int endOffset = offset.endOffset();
if (first) {
//first call for the current offset: restart the cursor ...
tokensCursor = -1;
//... and drop buffered tokens that end before the current offset;
//the first token overlapping the offset (if any) stays at index 0
while (!tokens.isEmpty() && tokens.get(0).token.getEnd() <= startOffset) {
tokens.remove(0);
}
}
//cursor already at the last buffered token -> try to load more data;
//if that fails the end of the text was reached
if (tokensCursor >= tokens.size() - 1 && !incrementTokenData()) {
return null; //EoF
}
TokenData candidate = tokens.get(tokensCursor + 1);
if (candidate.token.getStart() >= endOffset) {
//candidate starts after the current offset -> no token to return
return null;
}
//consume the candidate and return it
tokensCursor++;
return candidate;
}
Usage example of org.apache.stanbol.enhancer.engines.entitylinking.impl.TokenData in the Apache Stanbol project:
class LinkableTokenFilter, method incrementToken().
/**
 * Consumes the next token from the wrapped {@link #input} stream and decides
 * whether the entity tagger should perform a lookup at the current position.
 * A lookup is triggered when a linkable token overlaps the current offset,
 * when the token is part of a processable Named Entity chunk, when its chunk
 * has a linkable token (or enough matchable ones), or when a linkable token
 * is found within a short lookahead window after the last matchable token.
 * The decision is published via {@code taggable.setTaggable(lookup)}.
 * @return <code>true</code> if a token was consumed, <code>false</code> at
 * the end of the stream
 * @throws IOException forwarded from {@code input.incrementToken()}
 */
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
//statistics: total number of consumed tokens (used for the final
//lookup-percentage debug log)
incrementCount++;
boolean first = true;
TokenData token;
boolean lookup = false;
//index of the last matchable token seen for the current offset
int lastMatchable = -1;
//NOTE(review): lastIndex is only ever assigned lastMatchable below, so
//the two are always equal here - presumably kept separate for clarity
int lastIndex = -1;
log.trace("> solr:[{},{}] {}", new Object[] { offset.startOffset(), offset.endOffset(), termAtt });
//iterate over all analysed tokens overlapping the current solr token
while ((token = nextToken(first)) != null) {
log.trace(" < [{},{}]:{} (link {}, match; {})", new Object[] { token.token.getStart(), token.token.getEnd(), token.getTokenText(), token.isLinkable, token.isMatchable });
first = false;
if (token.isLinkable) {
log.trace(" + lookup because {} is linkable", token);
lookup = true;
} else if (token.isMatchable) {
lastMatchable = token.index;
lastIndex = lastMatchable;
}
//chunk based checks: Named Entities and chunks with linkable or
//multiple matchable tokens also trigger a lookup
if (!lookup && (!lpc.isIgnoreChunks()) && token.inChunk != null && token.inChunk.isProcessable) {
if (token.inChunk.isNamedEntity()) {
if (log.isTraceEnabled()) {
log.trace(" + lookup because {} is part of Named Entity '{}'", token.token, token.inChunk.chunk.getSpan());
}
lookup = true;
}
if (token.inChunk.hasLinkable() || (lpc.isLinkMultiMatchableTokensInChunk() && token.inChunk.getMatchableCount() > 1)) {
if (log.isTraceEnabled()) {
log.trace(" + lookup because {} is part of a linkable chunk '{}'", token.token, token.inChunk.chunk.getSpan());
}
lookup = true;
}
}
}
//lookahead: scan up to 3 tokens past the last matchable one for a
//linkable token; each matchable token found at the window edge extends
//the window by one
if (!lookup && lastIndex >= 0 && sectionData != null) {
List<TokenData> tokens = sectionData.getTokens();
int maxLookahead = Math.max(lastIndex, lastMatchable + 3);
for (int i = lastIndex + 1; !lookup && i < maxLookahead && i < tokens.size(); i++) {
token = tokens.get(i);
if (token.isLinkable) {
lookup = true;
} else if (token.isMatchable && (i + 1) == maxLookahead) {
//increase lookahead for matchable tokens
maxLookahead++;
}
}
}
//publish the decision to the TaggableTokenAttribute
this.taggable.setTaggable(lookup);
if (lookup) {
if (log.isTraceEnabled()) {
TokenData t = getToken();
log.trace("lookup: token [{},{}]: {} | word [{},{}]:{}", new Object[] { offset.startOffset(), offset.endOffset(), termAtt, t.token.getStart(), t.token.getEnd(), t.getTokenText() });
}
lookupCount++;
}
return true;
} else {
//end of stream: log the ratio of lookups to consumed tokens
//(float division - yields NaN rather than failing if no token was consumed)
log.debug("lookup percentage: {}", lookupCount * 100 / (float) incrementCount);
return false;
}
}