Search in sources :

Example 1 with TagLL

use of org.opensextant.solrtexttagger.TagLL in project stanbol by apache.

The reduce method of the class LinkableTokenFilter.

/**
 * Reduces the cluster of overlapping tags starting at {@code head[0]} in two phases:
 * (1) remove tags that do not overlap any linkable token, and tags that match
 *     too little of a processable chunk; (2) reduce the surviving tags similar
 *     to a TagClusterReducer, but comparing only each tag's "matchable span"
 *     (the character span over its matchable tokens) instead of the full text span.
 * NOTE(review): assumes {@code linkableTokens} is sorted by token start offset and
 * may be consumed front-to-back as tags advance through the text - confirm against
 * the code that fills {@code linkableTokens}.
 */
@Override
public void reduce(TagLL[] head) {
    //this implements a two phase reduce
    //(1) reduce Tags with no linkable tokens and not matching enough of the
    //    current chunk.
    //(2) reduce remaining Tags in the cluster similar to TagClusterReducer
    //    but only considering the "matchable span" of the Tags. Meaning the
    //    span over matchable Tokens and not the full Text.
    //this map holds the matchable spans for Tags. Filled during phase (1) and
    //used for phase(2)
    Map<TagLL, int[]> matchableTagSpan = new HashMap<TagLL, int[]>();
    //(1) reduce Tags based on link-/matchable tokens as well as chunks. 
    LinkableTokenContext linkableTokenContext;
    for (TagLL tag = head[0]; tag != null; tag = tag.getNextTag()) {
        int start = tag.getStartOffset();
        int end = tag.getEndOffset();
        //drop linkable tokens that end before this tag starts; they can not
        //overlap this or any following tag (tags advance through the text)
        linkableTokenContext = linkableTokens.isEmpty() ? null : linkableTokens.get(0);
        while (linkableTokenContext != null && linkableTokenContext.linkableToken.token.getEnd() <= start) {
            linkableTokens.remove(0);
            linkableTokenContext = linkableTokens.isEmpty() ? null : linkableTokens.get(0);
        }
        if (linkableTokenContext == null || linkableTokenContext.linkableToken.token.getStart() >= end) {
            //does not overlap any linkable token
            //remove the tag from the cluster
            tag.removeLL();
            if (log.isTraceEnabled()) {
                CharSequence tagSequence = at.getText().subSequence(start, end);
                log.trace(" > reduce tag {} - no overlapp with linkable token", tagSequence);
            }
        } else {
            //if the tag overlaps a linkable token 
            TokenData linkableToken = linkableTokenContext.linkableToken;
            List<TokenData> tokens = linkableTokenContext.context;
            //calculate the matchable start/end span of the current TagLL:
            //start from the overlap with the linkable token ...
            int[] mSpan = new int[] { Math.max(start, linkableToken.token.getStart()), Math.min(end, linkableToken.token.getEnd()) };
            //... then extend left over matchable tokens that still lie within the tag
            if (mSpan[0] > start) {
                for (int i = linkableToken.index - 1; i >= 0; i--) {
                    TokenData token = tokens.get(i);
                    int tStart = token.token.getStart();
                    if (tStart < start) {
                        break;
                    } else if (token.isMatchable) {
                        mSpan[0] = tStart;
                    }
                }
            }
            //... and extend right over matchable tokens that still lie within the tag
            if (mSpan[1] < end) {
                for (int i = linkableToken.index + 1; i < tokens.size(); i++) {
                    TokenData token = tokens.get(i);
                    int tEnd = token.token.getEnd();
                    if (tEnd > end) {
                        break;
                    } else if (token.isMatchable) {
                        mSpan[1] = tEnd;
                    }
                }
            }
            if (log.isTraceEnabled()) {
                CharSequence text = at.getText();
                log.trace(" - matchable Span {}{} for Tag {}[{},{}]", new Object[] { text.subSequence(mSpan[0], mSpan[1]), Arrays.toString(mSpan), text.subSequence(start, end), start, end });
            }
            matchableTagSpan.put(tag, mSpan);
            //check if the tag matches enough (>= minChunkMatchScore) of the
            //matchable tokens of the enclosing processable chunk
            ChunkData cd = linkableToken.inChunk;
            if (!lpc.isIgnoreChunks() && cd != null && cd.isProcessable) {
                int cstart = cd.getMatchableStartChar() >= 0 ? cd.getMatchableStartChar() : start;
                int cend = cd.getMatchableEndChar();
                if (cstart < start || cend > end) {
                    //if the tag does not cover the whole chunk
                    //count how many of the chunk's matchable tokens the tag covers
                    int num = 0;
                    int match = 0;
                    for (int i = cd.getMatchableStart(); i <= cd.getMatchableEnd(); i++) {
                        TokenData td = tokens.get(i);
                        if (td.isMatchable) {
                            num++;
                            // '&&' binds tighter than '||': the first counted token must
                            // start at/after the tag start; subsequent tokens must end
                            // at/before the tag end
                            if (match < 1 && td.token.getStart() >= start || match > 0 && td.token.getEnd() <= end) {
                                match++;
                            }
                        }
                    }
                    //reduce the tag if not enough matchable tokens of the Chunk are matched
                    if (((float) match / (float) num) < minChunkMatchScore && match < minFoundTokens) {
                        //ignore
                        tag.removeLL();
                        matchableTagSpan.remove(tag);
                        if (log.isTraceEnabled()) {
                            CharSequence text = at.getText();
                            log.trace(" - reduce tag {}[{},{}] - does only match " + "{} of {} of matchable Chunk {}[{},{}]", new Object[] { text.subSequence(start, end), start, end, match, num, text.subSequence(cstart, cend), cstart, cend });
                        }
                    } else if (log.isTraceEnabled()) {
                        CharSequence text = at.getText();
                        log.trace(" + keep tag {}[{},{}] - matches {} of {} " + "matchable Tokens for matchable Chunk {}[{},{}]", new Object[] { text.subSequence(start, end), start, end, match, num, text.subSequence(cstart, cend), cstart, cend });
                    }
                } else {
                    if (log.isTraceEnabled()) {
                        CharSequence text = at.getText();
                        log.trace(" + keep tag {}[{},{}] - matches whole Chunk {}[{},{}]", new Object[] { text.subSequence(start, end), start, end, text.subSequence(cstart, cend), cstart, cend });
                    }
                }
            } else if (log.isTraceEnabled()) {
                CharSequence tagSequence = at.getText().subSequence(start, end);
                log.trace(" + keep tag {} - not in processable chunk", tagSequence);
            }
        }
    }
    //(2) reduce the remaining Tags of the cluster based on their matchable
    //    spans (computed in phase (1) and stored in matchableTagSpan)
    if (head.length == 0 || head[0] == null || head[0].getNextTag() == null) {
        //no tag left from phase one or single token optimization
        return;
    }
    //can not use TagLL#mark (not accessible here); track kept tags in a local Set instead
    Set<TagLL> marked = new HashSet<TagLL>();
    while (true) {
        // --Find longest not already marked
        // (every tag still in the list has an entry in matchableTagSpan,
        //  because phase (1) removed the map entry whenever it removed a tag)
        TagLL longest = null;
        int longestMCharLen = -1;
        int[] longestMSpan = null;
        for (TagLL t = head[0]; t != null; t = t.getNextTag()) {
            int[] mSpan = matchableTagSpan.get(t);
            int mCharLen = mSpan[1] - mSpan[0];
            if (!marked.contains(t) && (longest == null || mCharLen >= longestMCharLen)) {
                longest = t;
                longestMSpan = mSpan;
                longestMCharLen = mCharLen;
            }
        }
        if (longest == null)
            break;
        // --Mark longest (so we return it eventually)
        marked.add(longest);
        // --Remove tags overlapping this longest
        for (TagLL t = head[0]; t != null; t = t.getNextTag()) {
            if (marked.contains(t)) {
                continue;
            }
            int[] mSpan = matchableTagSpan.get(t);
            boolean overlaps = mSpan[0] < longestMSpan[0] ? mSpan[1] > longestMSpan[1] : mSpan[0] < longestMSpan[1];
            if (overlaps) {
                t.removeLL();
            } else if (mSpan[0] >= longestMSpan[1]) {
                // no subsequent can possibly overlap
                break;
            }
        }
    }
// end of phase (2) reduction loop
}
Also used : ChunkData(org.apache.stanbol.enhancer.engines.entitylinking.impl.ChunkData) HashMap(java.util.HashMap) TagLL(org.opensextant.solrtexttagger.TagLL) TokenData(org.apache.stanbol.enhancer.engines.entitylinking.impl.TokenData) HashSet(java.util.HashSet)

Example 2 with TagLL

use of org.opensextant.solrtexttagger.TagLL in project stanbol by apache.

The reduce method of the class NamedEntityTokenFilter.

/**
 * Reduces the tag cluster starting at {@code head[0]} by keeping only tags
 * whose character span fully covers the next named-entity phrase; every
 * other tag is unlinked from the cluster. {@code nePhrases} is consumed
 * front-to-back as tags advance through the text.
 */
@Override
public void reduce(TagLL[] head) {
    //(1) reduce Tags based on named entity phrases. 
    TagLL current = head[0];
    while (current != null) {
        final int tagStart = current.getStartOffset();
        final int tagEnd = current.getEndOffset();
        //discard phrases that end before this tag starts - they can not be
        //covered by this tag nor by any following (later-starting) tag
        Chunk phrase = nePhrases.isEmpty() ? null : nePhrases.get(0);
        while (phrase != null && phrase.getEnd() <= tagStart) {
            nePhrases.remove(0);
            phrase = nePhrases.isEmpty() ? null : nePhrases.get(0);
        }
        final boolean coversPhrase = phrase != null
                && tagStart <= phrase.getStart() && tagEnd >= phrase.getEnd();
        if (coversPhrase) {
            //the current Tag covers a named entity phrase - keep it
            if (log.isTraceEnabled()) {
                log.trace(" > keep tag {} for {}", current, phrase);
            }
        } else {
            //does not cover any named entity phrase - remove from the cluster
            current.removeLL();
            if (log.isTraceEnabled()) {
                log.trace(" > reduce tag {} - does not cover {}", current, phrase);
            }
        }
        current = current.getNextTag();
    }
}
Also used : TagLL(org.opensextant.solrtexttagger.TagLL) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk)

Aggregations

TagLL (org.opensextant.solrtexttagger.TagLL)2 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1 ChunkData (org.apache.stanbol.enhancer.engines.entitylinking.impl.ChunkData)1 TokenData (org.apache.stanbol.enhancer.engines.entitylinking.impl.TokenData)1 Chunk (org.apache.stanbol.enhancer.nlp.model.Chunk)1