Example 71 with OffsetAttribute

use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project stanbol by apache.

the class SmartcnSentenceEngine method computeEnhancements.

/**
 * Compute enhancements for supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
 * stores it as a new part in the content item. The metadata is not changed.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *          if the underlying process failed to work as
 *          expected
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, false);
    if (!("zh".equals(language) || (language != null && language.startsWith("zh-")))) {
        throw new IllegalStateException("The detected language is NOT 'zh'! " + "As this is also checked within the #canEnhance(..) method this " + "indicates an Bug in the used EnhancementJobManager implementation. " + "Please report this on the dev@apache.stanbol.org or create an " + "JIRA issue about this.");
    }
    // first the sentences
    TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText()));
    try {
        sentences.reset();
        while (sentences.incrementToken()) {
            OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class);
            Sentence s = at.addSentence(offset.startOffset(), offset.endOffset());
            if (log.isTraceEnabled()) {
                log.trace("detected {}:{}", s, s.getSpan());
            }
        }
    } catch (IOException e) {
        String message = String.format("IOException while reading from " + "CharSequenceReader of AnalyzedText for ContentItem %s", ci.getUri());
        log.error(message, e);
        throw new EngineException(this, ci, message, e);
    }
}
Also used : AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) NlpEngineHelper.initAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText) TokenStream(org.apache.lucene.analysis.TokenStream) CharSequenceReader(org.apache.commons.io.input.CharSequenceReader) SentenceTokenizer(org.apache.lucene.analysis.cn.smart.SentenceTokenizer) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence)
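
The engine above follows the standard low-level Lucene contract for offset handling: request the attribute instance before consuming the stream, call reset() first, read startOffset()/endOffset() on every incrementToken(), and finish with end()/close(). A minimal standalone sketch of that pattern, assuming a recent Lucene release where StandardAnalyzer has a no-argument constructor (the class name OffsetDump and the field name "body" are illustrative, not taken from the example above):

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class OffsetDump {

    public static void main(String[] args) throws IOException {
        try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
            TokenStream ts = analyzer.tokenStream("body", "Offsets point into the original text");
            // attributes are requested once, before the stream is consumed
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            // reset() is mandatory before the first incrementToken() (since Lucene/Solr 4)
            ts.reset();
            while (ts.incrementToken()) {
                System.out.printf("%s [%d,%d)%n", term, offset.startOffset(), offset.endOffset());
            }
            // end() records the final offset state, close() releases the stream
            ts.end();
            ts.close();
        }
    }
}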

Example 72 with OffsetAttribute

use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project stanbol by apache.

the class QueryUtils method parseWildcardQueryTerms.

/**
 * Parses query terms for Wildcard queries as described in the first
 * comment of STANBOL-607. <p>
 * As an example the String:
 * <code><pre>
 *     "This is a te?t for multi* Toke? Wildc\*adrd Se?rche*
 * </pre></code>
 * is converted in the query terms
 * <code><pre>
 *     ["This is a","te?t","multi*","toke?","Wildc\*adrd","se?rche*"]
 * </pre></code>
 * NOTE that tokens that include wildcards are converted to lower case
 * @param value the value
 * @param loewercaseWildcardTokens if query elements that include a wildcard
 * should be converted to lower case.
 * @return the query terms
 * @throws IOException
 */
private static QueryTerm[] parseWildcardQueryTerms(String value, boolean loewercaseWildcardTokens) {
    // This assumes that the Tokenizer does tokenize '*' and '?',
    // which makes it a little bit tricky.
    Tokenizer tokenizer = new ICUTokenizer(new StringReader(value), tokenizerConfig);
    Matcher m = WILDCARD_QUERY_CHAR_PATTERN.matcher(value);
    int next = m.find() ? m.start() + 1 : -1;
    if (next < 0) {
        // No wildcard
        return new QueryTerm[] { new QueryTerm(value, false, true, true) };
    }
    ArrayList<QueryTerm> queryElements = new ArrayList<QueryTerm>(5);
    int lastAdded = -1;
    int lastOffset = 0;
    boolean foundWildcard = false;
    // Lucene tokenizers are really low level ...
    try {
        // starting with Solr4 reset MUST BE called before using
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            // only interested in the start/end indexes of tokens
            OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
            if (lastAdded < 0) {
                // (re)start the next query element with this token
                lastAdded = offset.startOffset();
            }
            if (foundWildcard) {
                // the previous token ended with a wildcard; decide how to close that query term
                if (offset.startOffset() > lastOffset + 1) {
                    // (1)
                    String queryElement = value.substring(lastAdded, lastOffset + 1);
                    if (loewercaseWildcardTokens) {
                        queryElement = queryElement.toLowerCase();
                    }
                    queryElements.add(new QueryTerm(queryElement, true, false, true));
                    // previous token consumed
                    lastAdded = offset.startOffset();
                    // set to the start of the current token
                    foundWildcard = false;
                } else if (next != offset.endOffset()) {
                    // (2)
                    String queryElement = value.substring(lastAdded, offset.endOffset());
                    if (loewercaseWildcardTokens) {
                        queryElement = queryElement.toLowerCase();
                    }
                    queryElements.add(new QueryTerm(queryElement, true, false, true));
                    // consume the current token
                    lastAdded = -1;
                    foundWildcard = false;
                }
            }
            if (next == offset.endOffset()) {
                // end of current token is '*' or '?'
                // search next '*', '?' in value
                next = m.find() ? m.start() + 1 : -1;
                // a single word
                if (!foundWildcard && lastAdded < lastOffset) {
                    String queryElement = value.substring(lastAdded, lastOffset);
                    queryElements.add(new QueryTerm(queryElement, false, true, true));
                    lastAdded = offset.startOffset();
                }
                // else multiple wildcards in a single token
                foundWildcard = true;
            }
            lastOffset = offset.endOffset();
        }
    } catch (IOException e) {
        // StringReader can not throw IOExceptions
        throw new IllegalStateException(e);
    }
    if (lastAdded >= 0 && lastAdded < value.length()) {
        String queryElement = value.substring(lastAdded, value.length());
        if (foundWildcard && loewercaseWildcardTokens) {
            queryElement = queryElement.toLowerCase();
        }
        if (foundWildcard) {
            queryElements.add(new QueryTerm(queryElement, true, false, true));
        } else {
            queryElements.add(new QueryTerm(queryElement, false, true, true));
        }
    }
    return queryElements.toArray(new QueryTerm[queryElements.size()]);
}
Also used : Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) IOException(java.io.IOException) ICUTokenizer(org.apache.lucene.analysis.icu.segmentation.ICUTokenizer) StringReader(java.io.StringReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) Tokenizer(org.apache.lucene.analysis.Tokenizer)
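
The subtle point in parseWildcardQueryTerms is that the ICUTokenizer splits on '*' and '?', so the method has to re-assemble wildcard terms from the token offsets while separately scanning the value for the next unescaped wildcard character. A minimal, dependency-free sketch of just that scanning step; the pattern below is a hypothetical stand-in for WILDCARD_QUERY_CHAR_PATTERN, whose actual definition is not shown here:

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class WildcardPositions {

    // Hypothetical stand-in for QueryUtils.WILDCARD_QUERY_CHAR_PATTERN:
    // an unescaped '*' or '?' (the real Stanbol pattern may differ).
    private static final Pattern WILDCARD = Pattern.compile("(?<!\\\\)[*?]");

    public static List<Integer> wildcardEnds(String value) {
        // Collect the index *after* each wildcard character, matching how the
        // method above compares 'next' against token end offsets.
        List<Integer> ends = new ArrayList<>();
        Matcher m = WILDCARD.matcher(value);
        while (m.find()) {
            ends.add(m.start() + 1);
        }
        return ends;
    }

    public static void main(String[] args) {
        // the escaped wildcard in "Wildc\*ard" is ignored
        System.out.println(wildcardEnds("te?t for multi* Wildc\\*ard"));  // prints [3, 15]
    }
}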

Example 73 with OffsetAttribute

use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project stanbol by apache.

the class KuromojiNlpEngine method computeEnhancements.

/**
 * Compute enhancements for supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
 * stores it as a new part in the content item. The metadata is not changed.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *          if the underlying process failed to work as
 *          expected
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, false);
    if (!("ja".equals(language) || (language != null && language.startsWith("ja-")))) {
        throw new IllegalStateException("The detected language is NOT 'ja'! " + "As this is also checked within the #canEnhance(..) method this " + "indicates an Bug in the used EnhancementJobManager implementation. " + "Please report this on the dev@apache.stanbol.org or create an " + "JIRA issue about this.");
    }
    // start with the Tokenizer
    TokenStream tokenStream = tokenizerFactory.create(new CharSequenceReader(at.getText()));
    // build the analyzing chain by adding all TokenFilters
    for (TokenFilterFactory filterFactory : filterFactories) {
        tokenStream = filterFactory.create(tokenStream);
    }
    // Try to extract sentences based on POS tags ...
    int sentStartOffset = -1;
    // NER data
    List<NerData> nerList = new ArrayList<NerData>();
    // the next index where the NerData.context need to be set
    int nerSentIndex = 0;
    NerData ner = null;
    OffsetAttribute offset = null;
    try {
        // required with Solr 4
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            offset = tokenStream.addAttribute(OffsetAttribute.class);
            Token token = at.addToken(offset.startOffset(), offset.endOffset());
            // Get the POS attribute and init the PosTag
            PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
            PosTag posTag = POS_TAG_SET.getTag(posAttr.getPartOfSpeech());
            if (posTag == null) {
                posTag = adhocTags.get(posAttr.getPartOfSpeech());
                if (posTag == null) {
                    posTag = new PosTag(posAttr.getPartOfSpeech());
                    adhocTags.put(posAttr.getPartOfSpeech(), posTag);
                    log.warn(" ... missing PosTag mapping for {}", posAttr.getPartOfSpeech());
                }
            }
            // Sentence detection by POS tag
            if (sentStartOffset < 0) {
                // the last token was a sentence ending
                sentStartOffset = offset.startOffset();
            }
            if (posTag.hasPos(Pos.Point)) {
                Sentence sent = at.addSentence(sentStartOffset, offset.startOffset());
                // add the sentence as context to the NerData instances
                while (nerSentIndex < nerList.size()) {
                    nerList.get(nerSentIndex).context = sent.getSpan();
                    nerSentIndex++;
                }
                sentStartOffset = -1;
            }
            // POS
            token.addAnnotation(POS_ANNOTATION, Value.value(posTag));
            // NER
            NerTag nerTag = NER_TAG_SET.getTag(posAttr.getPartOfSpeech());
            if (ner != null && (nerTag == null || !ner.tag.getType().equals(nerTag.getType()))) {
                // write NER annotation
                Chunk chunk = at.addChunk(ner.start, ner.end);
                chunk.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(ner.tag));
                // NOTE that the fise:TextAnnotation are written later based on the nerList
                // clean up
                ner = null;
            }
            if (nerTag != null) {
                if (ner == null) {
                    ner = new NerData(nerTag, offset.startOffset());
                    nerList.add(ner);
                }
                ner.end = offset.endOffset();
            }
            BaseFormAttribute baseFormAttr = tokenStream.addAttribute(BaseFormAttribute.class);
            MorphoFeatures morpho = null;
            if (baseFormAttr != null && baseFormAttr.getBaseForm() != null) {
                morpho = new MorphoFeatures(baseFormAttr.getBaseForm());
                // and add the posTag
                morpho.addPos(posTag);
            }
            InflectionAttribute inflectionAttr = tokenStream.addAttribute(InflectionAttribute.class);
            // inflection information is read but currently not mapped to an annotation
            inflectionAttr.getInflectionForm();
            inflectionAttr.getInflectionType();
            if (morpho != null) {
                // if present add the morpho
                token.addAnnotation(MORPHO_ANNOTATION, Value.value(morpho));
            }
        }
        // we still need to write the last sentence
        Sentence lastSent = null;
        if (offset != null && sentStartOffset >= 0 && offset.endOffset() > sentStartOffset) {
            lastSent = at.addSentence(sentStartOffset, offset.endOffset());
        }
        // and set the context of the remaining named entities
        while (nerSentIndex < nerList.size()) {
            if (lastSent != null) {
                nerList.get(nerSentIndex).context = lastSent.getSpan();
            } else {
                // no sentence detected
                nerList.get(nerSentIndex).context = at.getSpan();
            }
            nerSentIndex++;
        }
    } catch (IOException e) {
        throw new EngineException(this, ci, "Exception while reading from " + "AnalyzedText contentpart", e);
    } finally {
        try {
            tokenStream.close();
        } catch (IOException e) {
        /* ignore */
        }
    }
    // finally write the NER annotations to the metadata of the ContentItem
    final Graph metadata = ci.getMetadata();
    ci.getLock().writeLock().lock();
    try {
        Language lang = new Language("ja");
        for (NerData nerData : nerList) {
            IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
            metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(at.getSpan().substring(nerData.start, nerData.end), lang)));
            metadata.add(new TripleImpl(ta, DC_TYPE, nerData.tag.getType()));
            metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(nerData.start)));
            metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(nerData.end)));
            metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(nerData.context, lang)));
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) IRI(org.apache.clerezza.commons.rdf.IRI) TokenStream(org.apache.lucene.analysis.TokenStream) ArrayList(java.util.ArrayList) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Token(org.apache.stanbol.enhancer.nlp.model.Token) NlpEngineHelper.initAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) CharSequenceReader(org.apache.commons.io.input.CharSequenceReader) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) Language(org.apache.clerezza.commons.rdf.Language) NlpEngineHelper.getLanguage(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage) BaseFormAttribute(org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) MorphoFeatures(org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence) InflectionAttribute(org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) PartOfSpeechAttribute(org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute) IOException(java.io.IOException) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk) TokenFilterFactory(org.apache.lucene.analysis.util.TokenFilterFactory) Graph(org.apache.clerezza.commons.rdf.Graph) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute)
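
The NER handling in this engine is easy to lose in the long loop: a span is opened when a token first carries a NER tag, extended while the following tokens carry the same type, and closed as soon as the type changes or disappears; the fise:TextAnnotations are then written from the collected spans. A simplified, Lucene-free sketch of just that accumulation step (Span is a hypothetical stand-in for Stanbol's NerData, and plain strings stand in for NerTag instances):

import java.util.ArrayList;
import java.util.List;

public class NerSpans {

    /** Simplified stand-in for Stanbol's NerData: a typed character span. */
    static final class Span {
        final String type;
        final int start;
        int end;
        Span(String type, int start, int end) { this.type = type; this.start = start; this.end = end; }
        @Override public String toString() { return type + "[" + start + "," + end + ")"; }
    }

    /**
     * tags[i] is the NER type of token i (or null), offsets[i] = {start, end}.
     * Adjacent tokens with the same type are merged into one span, mirroring
     * the loop in computeEnhancements above.
     */
    static List<Span> collect(String[] tags, int[][] offsets) {
        List<Span> spans = new ArrayList<>();
        Span current = null;
        for (int i = 0; i < tags.length; i++) {
            String tag = tags[i];
            if (current != null && (tag == null || !current.type.equals(tag))) {
                current = null;                       // close the running span
            }
            if (tag != null) {
                if (current == null) {
                    current = new Span(tag, offsets[i][0], offsets[i][1]);
                    spans.add(current);
                }
                current.end = offsets[i][1];          // extend to the current token
            }
        }
        return spans;
    }

    public static void main(String[] args) {
        String[] tags = {null, "PERSON", "PERSON", null, "LOCATION"};
        int[][] offsets = {{0, 2}, {3, 6}, {7, 10}, {11, 13}, {14, 19}};
        System.out.println(collect(tags, offsets));   // [PERSON[3,10), LOCATION[14,19)]
    }
}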

Example 74 with OffsetAttribute

use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project sukija by ahomansikka.

the class SuggestionFilter method filter.

@Override
protected Iterator<String> filter() {
    OffsetAttribute offsetAtt = input.getAttribute(OffsetAttribute.class);
    if (LOG.isDebugEnabled())
        LOG.debug("Word-f A " + word + " " + termAtt.toString() + " " + offsetAtt.startOffset() + " " + offsetAtt.endOffset() + " " + Constants.toString(flagsAtt));
    if (hasFlag(flagsAtt, LATEX_HYPHEN)) {
        word = word.replace("\\-", "");
    }
    final int n = word.lastIndexOf('-');
    if (n > 0) {
        Constants.addFlags(flagsAtt, Constants.COMPOUND_WORD);
    } else {
        Constants.removeFlags(flagsAtt, Constants.COMPOUND_WORD);
    }
    if (LOG.isDebugEnabled())
        LOG.debug("Word-f B " + word + " " + termAtt.toString() + " " + Constants.toString(flagsAtt));
    if (hasFlag(flagsAtt, Constants.COMPOUND_WORD)) {
        if (AnalysisUtils.analyze(voikko, word, voikkoAtt, baseFormAtt, flagsAtt)) {
            return baseFormAtt.getBaseForms().iterator();
        } else {
            if (LOG.isDebugEnabled())
                LOG.debug("Word-f C " + word + " " + termAtt.toString() + " " + Constants.toString(flagsAtt));
            final String START = word.substring(0, n);
            final String END = word.substring(n + 1);
            if (LOG.isDebugEnabled())
                LOG.debug("Word-f D " + word + " " + termAtt.toString() + " " + Constants.toString(flagsAtt) + " [" + START + "] " + END);
            Set<String> baseForms = new HashSet<String>();
            Set<String> result = suggest(END);
            if (LOG.isDebugEnabled())
                LOG.debug("Word-f E " + END + " " + result.toString());
            if (result != null) {
                for (String u : result) {
                    baseForms.add(START + "-" + u);
                    baseForms.add(START + u);
                }
                return baseForms.iterator();
            }
        }
    } else {
        Set<String> s = suggest(word);
        if (s != null) {
            return s.iterator();
        }
    }
    return null;
}
Also used : OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) HashSet(java.util.HashSet)

Example 75 with OffsetAttribute

use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project zm-mailbox by Zimbra.

the class UniversalAnalyzerTest method testCJK.

private void testCJK(String src) throws IOException {
    TokenStream cjk = cjkAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute cjkTermAttr = cjk.addAttribute(CharTermAttribute.class);
    OffsetAttribute cjkOffsetAttr = cjk.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute cjkPosIncAttr = cjk.addAttribute(PositionIncrementAttribute.class);
    TokenStream uni = universalAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute uniTermAttr = uni.addAttribute(CharTermAttribute.class);
    OffsetAttribute uniOffsetAttr = uni.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute uniPosIncAttr = uni.addAttribute(PositionIncrementAttribute.class);
    while (true) {
        boolean result = cjk.incrementToken();
        Assert.assertEquals(result, uni.incrementToken());
        if (!result) {
            break;
        }
        String term = cjkTermAttr.toString();
        Assert.assertEquals(cjkTermAttr, uniTermAttr);
        if (assertOffset) {
            Assert.assertEquals(term, cjkOffsetAttr, uniOffsetAttr);
        }
        Assert.assertEquals(term, cjkPosIncAttr, uniPosIncAttr);
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StringReader(java.io.StringReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
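
The comparison loop in testCJK can be factored into a reusable helper that walks two TokenStreams in lock step. A sketch under the assumption of Lucene 4.x or later, where Analyzer.tokenStream(String, String) is available and TokenStream is Closeable; the class name, field name and the unconditional offset check are illustrative rather than taken from the zm-mailbox test:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.junit.Assert;

public final class TokenStreamAssert {

    /** Hypothetical helper: asserts both analyzers emit the same terms and offsets for text. */
    public static void assertSameTokens(Analyzer expected, Analyzer actual, String text) throws IOException {
        try (TokenStream a = expected.tokenStream("f", text);
             TokenStream b = actual.tokenStream("f", text)) {
            CharTermAttribute aTerm = a.addAttribute(CharTermAttribute.class);
            CharTermAttribute bTerm = b.addAttribute(CharTermAttribute.class);
            OffsetAttribute aOff = a.addAttribute(OffsetAttribute.class);
            OffsetAttribute bOff = b.addAttribute(OffsetAttribute.class);
            a.reset();
            b.reset();
            while (true) {
                boolean more = a.incrementToken();
                Assert.assertEquals(more, b.incrementToken());
                if (!more) {
                    break;
                }
                // compare term text first, then both offsets, using the term as the failure message
                Assert.assertEquals(aTerm.toString(), bTerm.toString());
                Assert.assertEquals(aTerm.toString(), aOff.startOffset(), bOff.startOffset());
                Assert.assertEquals(aTerm.toString(), aOff.endOffset(), bOff.endOffset());
            }
            a.end();
            b.end();
        }
    }
}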

Aggregations

OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)82 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)59 TokenStream (org.apache.lucene.analysis.TokenStream)47 StringReader (java.io.StringReader)36 PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)33 IOException (java.io.IOException)25 ArrayList (java.util.ArrayList)23 TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute)17 BytesRef (org.apache.lucene.util.BytesRef)14 PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute)12 Tokenizer (org.apache.lucene.analysis.Tokenizer)10 Reader (java.io.Reader)9 FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute)8 Analyzer (org.apache.lucene.analysis.Analyzer)7 Token (org.apache.lucene.analysis.Token)7 TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute)7 List (java.util.List)6 PackedTokenAttributeImpl (org.apache.lucene.analysis.tokenattributes.PackedTokenAttributeImpl)5 PositionLengthAttribute (org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute)5 IndexReader (org.apache.lucene.index.IndexReader)5