
Example 1 with TokenData

Use of org.apache.stanbol.enhancer.engines.entitylinking.impl.TokenData in project stanbol by apache.

The class LinkableTokenFilter, method reduce.

@Override
public void reduce(TagLL[] head) {
    //this implements a two-phase reduce:
    //(1) reduce Tags that do not overlap any linkable token or do not match
    //    enough of the current chunk.
    //(2) reduce the remaining Tags in the cluster similar to TagClusterReducer,
    //    but only considering the "matchable span" of the Tags, meaning the
    //    span over matchable Tokens rather than the full text.
    //this map holds the matchable spans of the Tags. It is filled during
    //phase (1) and used in phase (2)
    Map<TagLL, int[]> matchableTagSpan = new HashMap<TagLL, int[]>();
    //(1) reduce Tags based on link-/matchable tokens as well as chunks. 
    LinkableTokenContext linkableTokenContext;
    for (TagLL tag = head[0]; tag != null; tag = tag.getNextTag()) {
        int start = tag.getStartOffset();
        int end = tag.getEndOffset();
        linkableTokenContext = linkableTokens.isEmpty() ? null : linkableTokens.get(0);
        while (linkableTokenContext != null && linkableTokenContext.linkableToken.token.getEnd() <= start) {
            linkableTokens.remove(0);
            linkableTokenContext = linkableTokens.isEmpty() ? null : linkableTokens.get(0);
        }
        if (linkableTokenContext == null || linkableTokenContext.linkableToken.token.getStart() >= end) {
            //does not overlap any linkable token
            //remove the tag from the cluster
            tag.removeLL();
            if (log.isTraceEnabled()) {
                CharSequence tagSequence = at.getText().subSequence(start, end);
                log.trace(" > reduce tag {} - no overlapp with linkable token", tagSequence);
            }
        } else {
            //if the tag overlaps a linkable token 
            TokenData linkableToken = linkableTokenContext.linkableToken;
            List<TokenData> tokens = linkableTokenContext.context;
            //calculate the matchable start/end span of the current TagLL
            int[] mSpan = new int[] { Math.max(start, linkableToken.token.getStart()), Math.min(end, linkableToken.token.getEnd()) };
            if (mSpan[0] > start) {
                for (int i = linkableToken.index - 1; i >= 0; i--) {
                    TokenData token = tokens.get(i);
                    int tStart = token.token.getStart();
                    if (tStart < start) {
                        break;
                    } else if (token.isMatchable) {
                        mSpan[0] = tStart;
                    }
                }
            }
            if (mSpan[1] < end) {
                for (int i = linkableToken.index + 1; i < tokens.size(); i++) {
                    TokenData token = tokens.get(i);
                    int tEnd = token.token.getEnd();
                    if (tEnd > end) {
                        break;
                    } else if (token.isMatchable) {
                        mSpan[1] = tEnd;
                    }
                }
            }
            if (log.isTraceEnabled()) {
                CharSequence text = at.getText();
                log.trace(" - matchable Span {}{} for Tag {}[{},{}]", new Object[] { text.subSequence(mSpan[0], mSpan[1]), Arrays.toString(mSpan), text.subSequence(start, end), start, end });
            }
            matchableTagSpan.put(tag, mSpan);
            //check if the tag matches enough (minChunkMatchScore) of the chunk
            ChunkData cd = linkableToken.inChunk;
            if (!lpc.isIgnoreChunks() && cd != null && cd.isProcessable) {
                int cstart = cd.getMatchableStartChar() >= 0 ? cd.getMatchableStartChar() : start;
                int cend = cd.getMatchableEndChar();
                if (cstart < start || cend > end) {
                    //if the tag does not cover the whole chunk
                    int num = 0;
                    int match = 0;
                    for (int i = cd.getMatchableStart(); i <= cd.getMatchableEnd(); i++) {
                        TokenData td = tokens.get(i);
                        if (td.isMatchable) {
                            num++;
                            if (match < 1 && td.token.getStart() >= start || match > 0 && td.token.getEnd() <= end) {
                                match++;
                            }
                        }
                    }
                    //reduce the tag unless it matches enough of the Chunk's
                    //matchable tokens (minChunkMatchScore) or at least
                    //minFoundTokens of them
                    if (((float) match / (float) num) < minChunkMatchScore && match < minFoundTokens) {
                        //ignore
                        tag.removeLL();
                        matchableTagSpan.remove(tag);
                        if (log.isTraceEnabled()) {
                            CharSequence text = at.getText();
                            log.trace(" - reduce tag {}[{},{}] - does only match " + "{} of {} of matchable Chunk {}[{},{}]", new Object[] { text.subSequence(start, end), start, end, match, num, text.subSequence(cstart, cend), cstart, cend });
                        }
                    } else if (log.isTraceEnabled()) {
                        CharSequence text = at.getText();
                        log.trace(" + keep tag {}[{},{}] - matches {} of {} " + "matchable Tokens for matchable Chunk {}[{},{}]", new Object[] { text.subSequence(start, end), start, end, match, num, text.subSequence(cstart, cend), cstart, cend });
                    }
                } else {
                    if (log.isTraceEnabled()) {
                        CharSequence text = at.getText();
                        log.trace(" + keep tag {}[{},{}] - matches whole Chunk {}[{},{}]", new Object[] { text.subSequence(start, end), start, end, text.subSequence(cstart, cend), cstart, cend });
                    }
                }
            } else if (log.isTraceEnabled()) {
                CharSequence tagSequence = at.getText().subSequence(start, end);
                log.trace(" + keep tag {} - not in processable chunk", tagSequence);
            }
        }
    }
    //(2) reduce the remaining Tags in the cluster based on their matchable spans
    if (head.length == 0 || head[0] == null || head[0].getNextTag() == null) {
        //no tags left after phase one, or only a single tag (nothing to reduce)
        return;
    }
    //cannot use TagLL#mark, so marked tags are tracked in a Set
    Set<TagLL> marked = new HashSet<TagLL>();
    while (true) {
        // --Find longest not already marked
        TagLL longest = null;
        int longestMCharLen = -1;
        int[] longestMSpan = null;
        for (TagLL t = head[0]; t != null; t = t.getNextTag()) {
            int[] mSpan = matchableTagSpan.get(t);
            int mCharLen = mSpan[1] - mSpan[0];
            if (!marked.contains(t) && (longest == null || mCharLen >= longestMCharLen)) {
                longest = t;
                longestMSpan = mSpan;
                longestMCharLen = mCharLen;
            }
        }
        if (longest == null)
            break;
        // --Mark longest (so we return it eventually)
        marked.add(longest);
        // --Remove tags overlapping this longest
        for (TagLL t = head[0]; t != null; t = t.getNextTag()) {
            if (marked.contains(t)) {
                continue;
            }
            int[] mSpan = matchableTagSpan.get(t);
            boolean overlaps = mSpan[0] < longestMSpan[0] ? mSpan[1] > longestMSpan[1] : mSpan[0] < longestMSpan[1];
            if (overlaps) {
                t.removeLL();
            } else if (mSpan[0] >= longestMSpan[1]) {
                // no subsequent can possibly overlap
                break;
            }
        }
    } // end of the while(true) reduce loop
}
Also used: ChunkData (org.apache.stanbol.enhancer.engines.entitylinking.impl.ChunkData), HashMap (java.util.HashMap), TagLL (org.opensextant.solrtexttagger.TagLL), TokenData (org.apache.stanbol.enhancer.engines.entitylinking.impl.TokenData), HashSet (java.util.HashSet)
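
Phase (2) above keeps the longest matchable span and removes the tags overlapping it, similar to TagClusterReducer but over the spans collected in phase (1) instead of the full tag offsets. Below is a minimal standalone sketch of that greedy selection over plain int[] spans; the class and method names are illustrative, not part of the Stanbol or solrtexttagger API, and the overlap test is written as a conventional half-open interval check.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class LongestSpanReducer {

    /** true if the half-open spans [start,end) a and b share at least one character */
    static boolean overlaps(int[] a, int[] b) {
        return a[0] < b[1] && b[0] < a[1];
    }

    /** greedily keeps the longest span and drops all spans overlapping it */
    static List<int[]> reduce(List<int[]> spans) {
        List<int[]> remaining = new ArrayList<int[]>(spans);
        List<int[]> kept = new ArrayList<int[]>();
        while (!remaining.isEmpty()) {
            int[] longest = remaining.get(0);
            for (int[] s : remaining) {
                //'>=' resolves ties to the right, as in phase (2) above
                if (s[1] - s[0] >= longest[1] - longest[0]) {
                    longest = s;
                }
            }
            kept.add(longest);
            final int[] winner = longest;
            //also removes the winner itself, since it overlaps itself
            remaining.removeIf(s -> overlaps(s, winner));
        }
        return kept;
    }

    public static void main(String[] args) {
        List<int[]> spans = Arrays.asList(
            new int[] { 0, 10 },   //e.g. "Barack Obama"
            new int[] { 7, 10 },   //e.g. "Obama" - overlaps and is shorter
            new int[] { 12, 18 }); //e.g. "visits" - disjoint
        for (int[] s : reduce(spans)) {
            System.out.println(Arrays.toString(s));
        }
    }
}

Running main prints [0, 10] and [12, 18]: the shorter overlapping span is dropped and the disjoint one survives. The real implementation performs the same selection in place on the TagLL linked list and can stop early (the "no subsequent can possibly overlap" break) because the tags arrive ordered by start offset.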

Example 2 with TokenData

Use of org.apache.stanbol.enhancer.engines.entitylinking.impl.TokenData in project stanbol by apache.

The class LinkableTokenFilter, method nextToken.

/**
     * Iterating over TokenData requires iterating over two hierarchy levels:
     * (1) sections (likely Sentences) and (2) Tokens <p>
     * <b>NOTE</b> that this method modifies a lot of fields to update the
     * state of the iteration accordingly. If the {@link #token} field is
     * <code>null</code> after a call to this method, this indicates that the
     * end of the {@link Token}s in the {@link AnalysedText} was reached.
     * @param first is this the first call for the current {@link #offset} state?
     * @return the token or <code>null</code> if there are no more tokens for
     * the current {@link #offset}
     */
private TokenData nextToken(boolean first) {
    int startOffset = offset.startOffset();
    int endOffset = offset.endOffset();
    if (first) {
        //on the first call for a token
        //reset cursor to zero
        tokensCursor = -1;
        while (!tokens.isEmpty()) {
            //remove tokens earlier than the current offset
            if (tokens.get(0).token.getEnd() <= startOffset) {
                tokens.remove(0);
            } else {
                //stop on the first overlapping token
                break;
            }
        }
    //else nothing to do
    }
    if (tokensCursor >= tokens.size() - 1) {
        if (!incrementTokenData()) {
            //EoF
            return null;
        }
    }
    TokenData cursorToken = tokens.get(tokensCursor + 1);
    if (cursorToken.token.getStart() < endOffset) {
        //set the next token as current
        tokensCursor++;
        //and return it
        return cursorToken;
    } else {
        return null;
    }
}
Also used: TokenData (org.apache.stanbol.enhancer.engines.entitylinking.impl.TokenData)
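
The javadoc above describes the two hierarchy levels (sections, then tokens) that nextToken walks with cursor fields. Stripped of the offset handling, the underlying pattern is a two-level iterator; the following self-contained sketch shows it with illustrative names (TwoLevelIterator and tokensOf are not Stanbol types).

import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.function.Function;

/** Flattens a section/token hierarchy into a single token stream (illustrative). */
class TwoLevelIterator<S, T> implements Iterator<T> {

    private final Iterator<S> sections;
    private final Function<S, List<T>> tokensOf;
    private Iterator<T> tokens = Collections.emptyIterator();

    TwoLevelIterator(Iterator<S> sections, Function<S, List<T>> tokensOf) {
        this.sections = sections;
        this.tokensOf = tokensOf;
    }

    @Override
    public boolean hasNext() {
        //advance the outer (section) cursor until the inner one has tokens left
        while (!tokens.hasNext() && sections.hasNext()) {
            tokens = tokensOf.apply(sections.next()).iterator();
        }
        return tokens.hasNext();
    }

    @Override
    public T next() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        return tokens.next();
    }
}

In the real filter the outer step is incrementTokenData(), which loads the next section's tokens, and the inner cursor additionally drops tokens that end before the current Solr token's start offset.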

Example 3 with TokenData

Use of org.apache.stanbol.enhancer.engines.entitylinking.impl.TokenData in project stanbol by apache.

The class LinkableTokenFilter, method incrementToken.

@Override
public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
        incrementCount++;
        boolean first = true;
        TokenData token;
        boolean lookup = false;
        int lastMatchable = -1;
        int lastIndex = -1;
        log.trace("> solr:[{},{}] {}", new Object[] { offset.startOffset(), offset.endOffset(), termAtt });
        while ((token = nextToken(first)) != null) {
            log.trace("  < [{},{}]:{} (link {}, match; {})", new Object[] { token.token.getStart(), token.token.getEnd(), token.getTokenText(), token.isLinkable, token.isMatchable });
            first = false;
            if (token.isLinkable) {
                log.trace("  + lookup because {} is linkable", token);
                lookup = true;
            } else if (token.isMatchable) {
                lastMatchable = token.index;
                lastIndex = lastMatchable;
            }
            //also lookup if the token is part of a processable chunk with
            //multiple matchable tokens.
            if (!lookup && (!lpc.isIgnoreChunks()) && token.inChunk != null && token.inChunk.isProcessable) {
                if (token.inChunk.isNamedEntity()) {
                    if (log.isTraceEnabled()) {
                        log.trace("  + lookup because {} is part of Named Entity '{}'", token.token, token.inChunk.chunk.getSpan());
                    }
                    lookup = true;
                }
                if (token.inChunk.hasLinkable() || (lpc.isLinkMultiMatchableTokensInChunk() && token.inChunk.getMatchableCount() > 1)) {
                    if (log.isTraceEnabled()) {
                        log.trace("  + lookup because {} is part of a linkable chunk '{}'", token.token, token.inChunk.chunk.getSpan());
                    }
                    lookup = true;
                }
            }
        }
        //lookahead: scan the next few tokens for a linkable one
        if (!lookup && lastIndex >= 0 && sectionData != null) {
            List<TokenData> tokens = sectionData.getTokens();
            int maxLookahead = Math.max(lastIndex, lastMatchable + 3);
            for (int i = lastIndex + 1; !lookup && i < maxLookahead && i < tokens.size(); i++) {
                token = tokens.get(i);
                if (token.isLinkable) {
                    lookup = true;
                } else if (token.isMatchable && (i + 1) == maxLookahead) {
                    //increase lookahead for matchable tokens
                    maxLookahead++;
                }
            }
        }
        this.taggable.setTaggable(lookup);
        if (lookup) {
            if (log.isTraceEnabled()) {
                TokenData t = getToken();
                log.trace("lookup: token [{},{}]: {} | word [{},{}]:{}", new Object[] { offset.startOffset(), offset.endOffset(), termAtt, t.token.getStart(), t.token.getEnd(), t.getTokenText() });
            }
            lookupCount++;
        }
        return true;
    } else {
        log.debug("lookup percentage: {}", lookupCount * 100 / (float) incrementCount);
        return false;
    }
}
Also used: TokenData (org.apache.stanbol.enhancer.engines.entitylinking.impl.TokenData)
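
The lookahead block in incrementToken scans up to three tokens past the last matchable one and widens the window whenever a matchable token sits exactly at its edge. Here is the rule in isolation, as a sketch over plain boolean arrays; the method and parameter names are made up for illustration.

/** Returns true if a linkable token occurs within the adaptive lookahead window. */
static boolean lookahead(boolean[] linkable, boolean[] matchable,
        int lastIndex, int lastMatchable) {
    int maxLookahead = Math.max(lastIndex, lastMatchable + 3);
    for (int i = lastIndex + 1; i < maxLookahead && i < linkable.length; i++) {
        if (linkable[i]) {
            return true;
        } else if (matchable[i] && (i + 1) == maxLookahead) {
            //a matchable token at the edge extends the window by one
            maxLookahead++;
        }
    }
    return false;
}

A run of matchable tokens therefore keeps extending the lookahead until a non-matchable token ends it, which lets multi-token labels trigger a lookup even when the linkable token lies a few positions ahead.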

Aggregations

TokenData (org.apache.stanbol.enhancer.engines.entitylinking.impl.TokenData): 3
HashMap (java.util.HashMap): 1
HashSet (java.util.HashSet): 1
ChunkData (org.apache.stanbol.enhancer.engines.entitylinking.impl.ChunkData): 1
TagLL (org.opensextant.solrtexttagger.TagLL): 1