
Example 16 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.

From class TestSnowball, method testFilterTokens.

public void testFilterTokens() throws Exception {
    SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = filter.getAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = filter.getAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = filter.getAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = filter.getAttribute(PositionIncrementAttribute.class);
    FlagsAttribute flagsAtt = filter.getAttribute(FlagsAttribute.class);
    filter.incrementToken();
    assertEquals("accent", termAtt.toString());
    assertEquals(2, offsetAtt.startOffset());
    assertEquals(7, offsetAtt.endOffset());
    assertEquals("wrd", typeAtt.type());
    assertEquals(3, posIncAtt.getPositionIncrement());
    assertEquals(77, flagsAtt.getFlags());
    assertEquals(new BytesRef(new byte[] { 0, 1, 2, 3 }), payloadAtt.getPayload());
}
Also used : PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) FlagsAttribute(org.apache.lucene.analysis.tokenattributes.FlagsAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) BytesRef(org.apache.lucene.util.BytesRef) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
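
For comparison, here is a minimal self-contained sketch of the same attribute-reading pattern against a stock analyzer; the WhitespaceAnalyzer, field name, and sample text are illustrative assumptions, not part of the test above.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class OffsetAttributeDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new WhitespaceAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("body", "accents on words")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
            // Standard consume cycle: reset, incrementToken loop, end (close via try-with-resources).
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(termAtt + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ")");
            }
            ts.end();
        }
    }
}

The consume cycle is the same whether the attributes come from a filter such as SnowballFilter (as in the test) or from an analyzer-produced stream.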

Example 17 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.

From class TokenSourcesTest, method testMaxStartOffsetConsistency.

public void testMaxStartOffsetConsistency() throws IOException {
    FieldType tvFieldType = new FieldType(TextField.TYPE_NOT_STORED);
    tvFieldType.setStoreTermVectors(true);
    tvFieldType.setStoreTermVectorOffsets(true);
    tvFieldType.setStoreTermVectorPositions(true);
    Directory dir = newDirectory();
    MockAnalyzer analyzer = new MockAnalyzer(random());
    //we don't necessarily consume the whole stream because of limiting by startOffset
    analyzer.setEnableChecks(false);
    Document doc = new Document();
    final String TEXT = " f gg h";
    doc.add(new Field("fld_tv", analyzer.tokenStream("fooFld", TEXT), tvFieldType));
    doc.add(new TextField("fld_notv", analyzer.tokenStream("barFld", TEXT)));
    IndexReader reader;
    try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
        writer.addDocument(doc);
        reader = writer.getReader();
    }
    try {
        Fields tvFields = reader.getTermVectors(0);
        for (int maxStartOffset = -1; maxStartOffset <= TEXT.length(); maxStartOffset++) {
            TokenStream tvStream = TokenSources.getTokenStream("fld_tv", tvFields, TEXT, analyzer, maxStartOffset);
            TokenStream anaStream = TokenSources.getTokenStream("fld_notv", tvFields, TEXT, analyzer, maxStartOffset);
            //assert have same tokens, none of which has a start offset > maxStartOffset
            final OffsetAttribute tvOffAtt = tvStream.addAttribute(OffsetAttribute.class);
            final OffsetAttribute anaOffAtt = anaStream.addAttribute(OffsetAttribute.class);
            tvStream.reset();
            anaStream.reset();
            while (tvStream.incrementToken()) {
                assertTrue(anaStream.incrementToken());
                assertEquals(tvOffAtt.startOffset(), anaOffAtt.startOffset());
                if (maxStartOffset >= 0)
                    assertTrue(tvOffAtt.startOffset() <= maxStartOffset);
            }
            assertTrue(anaStream.incrementToken() == false);
            tvStream.end();
            anaStream.end();
            tvStream.close();
            anaStream.close();
        }
    } finally {
        reader.close();
    }
    dir.close();
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) Document(org.apache.lucene.document.Document) FieldType(org.apache.lucene.document.FieldType) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) Fields(org.apache.lucene.index.Fields) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) IndexReader(org.apache.lucene.index.IndexReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) TextField(org.apache.lucene.document.TextField) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) Directory(org.apache.lucene.store.Directory)
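
For context, TokenSources.getTokenStream is a highlighter utility: it rebuilds a TokenStream from stored term vectors when they are available and falls back to re-analysis otherwise. A rough usage sketch follows; the field name, query, and the choice of -1 (no startOffset limit) are assumptions for illustration, not taken from the test.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.TokenSources;

public class HighlightSketch {
    // Hypothetical helper: highlight one field value of one document.
    static String bestFragment(IndexReader reader, int docId, String text,
                               Analyzer analyzer, Query query) throws Exception {
        Fields tvFields = reader.getTermVectors(docId);
        // -1 means no maxStartOffset limit; term vectors are used if stored, else re-analysis.
        TokenStream ts = TokenSources.getTokenStream("fld_tv", tvFields, text, analyzer, -1);
        Highlighter highlighter = new Highlighter(new QueryScorer(query));
        return highlighter.getBestFragment(ts, text);
    }
}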

Example 18 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.

From class TokenSourcesTest, method testPayloads.

// LUCENE-5294
public void testPayloads() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
    myFieldType.setStoreTermVectors(true);
    myFieldType.setStoreTermVectorOffsets(true);
    myFieldType.setStoreTermVectorPositions(true);
    myFieldType.setStoreTermVectorPayloads(true);
    curOffset = 0;
    Token[] tokens = new Token[] { getToken("foxes"), getToken("can"), getToken("jump"), getToken("high") };
    Document doc = new Document();
    doc.add(new Field("field", new CannedTokenStream(tokens), myFieldType));
    writer.addDocument(doc);
    IndexReader reader = writer.getReader();
    writer.close();
    assertEquals(1, reader.numDocs());
    TokenStream ts = TokenSources.getTermVectorTokenStreamOrNull("field", reader.getTermVectors(0), -1);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
    PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    ts.reset();
    for (Token token : tokens) {
        assertTrue(ts.incrementToken());
        assertEquals(token.toString(), termAtt.toString());
        assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
        assertEquals(token.getPayload(), payloadAtt.getPayload());
        assertEquals(token.startOffset(), offsetAtt.startOffset());
        assertEquals(token.endOffset(), offsetAtt.endOffset());
    }
    assertFalse(ts.incrementToken());
    reader.close();
    dir.close();
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) Token(org.apache.lucene.analysis.Token) Document(org.apache.lucene.document.Document) FieldType(org.apache.lucene.document.FieldType) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) IndexReader(org.apache.lucene.index.IndexReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) Directory(org.apache.lucene.store.Directory)
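
Note that curOffset and getToken(...) are helpers defined elsewhere in TokenSourcesTest and are not shown above. A plausible sketch of such a helper, illustrative only and not the actual test code, would build each Token with a payload and advancing offsets:

// Illustrative fragment only; the real helper lives elsewhere in TokenSourcesTest.
private int curOffset;

private Token getToken(String text) {
    Token t = new Token(text, curOffset, curOffset + text.length());
    t.setPayload(new BytesRef(text));
    // Assumption: one character of gap between consecutive tokens.
    curOffset += text.length() + 1;
    return t;
}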

Example 19 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.

From class TokenStreamToTermAutomatonQuery, method toQuery.

/** Pulls the graph (including {@link
   *  PositionLengthAttribute}) from the provided {@link
   *  TokenStream}, and creates the corresponding
   *  automaton where arcs are bytes (or Unicode code points 
   *  if unicodeArcs = true) from each term. */
public TermAutomatonQuery toQuery(String field, TokenStream in) throws IOException {
    final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
    final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
    final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);
    in.reset();
    TermAutomatonQuery query = new TermAutomatonQuery(field);
    int pos = -1;
    int lastPos = 0;
    int maxOffset = 0;
    int maxPos = -1;
    int state = -1;
    while (in.incrementToken()) {
        int posInc = posIncAtt.getPositionIncrement();
        if (preservePositionIncrements == false && posInc > 1) {
            posInc = 1;
        }
        assert pos > -1 || posInc > 0;
        if (posInc > 1) {
            throw new IllegalArgumentException("cannot handle holes; to accept any term, use '*' term");
        }
        if (posInc > 0) {
            // New node:
            pos += posInc;
        }
        int endPos = pos + posLengthAtt.getPositionLength();
        while (state < endPos) {
            state = query.createState();
        }
        BytesRef term = termBytesAtt.getBytesRef();
        //System.out.println(pos + "-" + endPos + ": " + term.utf8ToString() + ": posInc=" + posInc);
        if (term.length == 1 && term.bytes[term.offset] == (byte) '*') {
            query.addAnyTransition(pos, endPos);
        } else {
            query.addTransition(pos, endPos, term);
        }
        maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
        maxPos = Math.max(maxPos, endPos);
    }
    in.end();
    // TODO: look at endOffset?  ts2a did...
    // TODO: this (setting "last" state as the only accept state) may be too simplistic?
    query.setAccept(state, true);
    query.finish();
    return query;
}
Also used : PositionLengthAttribute(org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) BytesRef(org.apache.lucene.util.BytesRef) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
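
A rough usage sketch of this method; the field name, sample text, StandardAnalyzer, and the surrounding IndexSearcher are assumptions, not taken from the Lucene sources.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermAutomatonQuery;
import org.apache.lucene.search.TokenStreamToTermAutomatonQuery;
import org.apache.lucene.search.TopDocs;

public class TermAutomatonSketch {
    static TopDocs searchTokenGraph(IndexSearcher searcher, String text) throws Exception {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("body", text)) {
            // toQuery resets and consumes the stream, then builds an automaton over the token graph.
            TermAutomatonQuery query = new TokenStreamToTermAutomatonQuery().toQuery("body", ts);
            return searcher.search(query, 10);
        }
    }
}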

Example 20 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.

From class FreeTextSuggester, method lookup.

/** Retrieve suggestions. */
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, int num) throws IOException {
    if (contexts != null) {
        throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    if (fst == null) {
        throw new IllegalStateException("Lookup not supported at this time");
    }
    try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
        TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        BytesRefBuilder[] lastTokens = new BytesRefBuilder[grams];
        //System.out.println("lookup: key='" + key + "'");
        // Run full analysis, but save only the
        // last 1gram, last 2gram, etc.:
        int maxEndOffset = -1;
        boolean sawRealToken = false;
        while (ts.incrementToken()) {
            BytesRef tokenBytes = termBytesAtt.getBytesRef();
            sawRealToken |= tokenBytes.length > 0;
            // TODO: this is somewhat iffy; today, ShingleFilter
            // sets posLen to the gram count; maybe we should make
            // a separate dedicated att for this?
            int gramCount = posLenAtt.getPositionLength();
            assert gramCount <= grams;
            // Safety: make sure the recalculated count "agrees":
            if (countGrams(tokenBytes) != gramCount) {
                throw new IllegalArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + countGrams(tokenBytes));
            }
            maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
            BytesRefBuilder b = new BytesRefBuilder();
            b.append(tokenBytes);
            lastTokens[gramCount - 1] = b;
        }
        ts.end();
        if (!sawRealToken) {
            throw new IllegalArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
        }
        // Carefully fill last tokens with _ tokens;
        // ShingleFilter apparently won't emit "only hole"
        // tokens:
        int endPosInc = posIncAtt.getPositionIncrement();
        // Note this will also be true if input is the empty
        // string (in which case we saw no tokens and
        // maxEndOffset is still -1), which in fact works out OK
        // because we fill the unigram with an empty BytesRef
        // below:
        boolean lastTokenEnded = offsetAtt.endOffset() > maxEndOffset || endPosInc > 0;
        if (lastTokenEnded) {
            // starting with "foo":
            for (int i = grams - 1; i > 0; i--) {
                BytesRefBuilder token = lastTokens[i - 1];
                if (token == null) {
                    continue;
                }
                token.append(separator);
                lastTokens[i] = token;
            }
            lastTokens[0] = new BytesRefBuilder();
        }
        Arc<Long> arc = new Arc<>();
        BytesReader bytesReader = fst.getBytesReader();
        // Try highest order models first, and if they return
        // results, return that; else, fallback:
        double backoff = 1.0;
        List<LookupResult> results = new ArrayList<>(num);
        // We only add a given suffix once, from the highest
        // order model that saw it; for subsequent lower order
        // models we skip it:
        final Set<BytesRef> seen = new HashSet<>();
        for (int gram = grams - 1; gram >= 0; gram--) {
            BytesRefBuilder token = lastTokens[gram];
            // Don't make unigram predictions from empty string:
            if (token == null || (token.length() == 0 && key.length() > 0)) {
                //System.out.println("  gram=" + gram + ": skip: not enough input");
                continue;
            }
            if (endPosInc > 0 && gram <= endPosInc) {
                //System.out.println("  break: only holes now");
                break;
            }
            //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());
            // TODO: we could add fuzziness here
            // match the prefix portion exactly
            //Pair<Long,BytesRef> prefixOutput = null;
            Long prefixOutput = null;
            try {
                prefixOutput = lookupPrefix(fst, bytesReader, token.get(), arc);
            } catch (IOException bogus) {
                throw new RuntimeException(bogus);
            }
            if (prefixOutput == null) {
                // This model never saw this prefix, e.g. the
                // trigram model never saw context "purple mushroom"
                backoff *= ALPHA;
                continue;
            }
            // TODO: we could do this division at build time, and
            // bake it into the FST?
            // Denominator for computing scores from current
            // model's predictions:
            long contextCount = totTokens;
            BytesRef lastTokenFragment = null;
            for (int i = token.length() - 1; i >= 0; i--) {
                if (token.byteAt(i) == separator) {
                    BytesRef context = new BytesRef(token.bytes(), 0, i);
                    Long output = Util.get(fst, Util.toIntsRef(context, new IntsRefBuilder()));
                    assert output != null;
                    contextCount = decodeWeight(output);
                    lastTokenFragment = new BytesRef(token.bytes(), i + 1, token.length() - i - 1);
                    break;
                }
            }
            final BytesRefBuilder finalLastToken = new BytesRefBuilder();
            if (lastTokenFragment == null) {
                finalLastToken.copyBytes(token.get());
            } else {
                finalLastToken.copyBytes(lastTokenFragment);
            }
            CharsRefBuilder spare = new CharsRefBuilder();
            // complete top-N
            TopResults<Long> completions = null;
            try {
                // Because we store multiple models in one FST
                // (1gram, 2gram, 3gram), we must restrict the
                // search so that it only considers the current
                // model.  For highest order model, this is not
                // necessary since all completions in the FST
                // must be from this model, but for lower order
                // models we have to filter out the higher order
                // ones:
                // Must do num+seen.size() for queue depth because we may
                // reject up to seen.size() paths in acceptResult():
                Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, num, num + seen.size(), weightComparator) {

                    BytesRefBuilder scratchBytes = new BytesRefBuilder();

                    @Override
                    protected void addIfCompetitive(Util.FSTPath<Long> path) {
                        if (path.arc.label != separator) {
                            //System.out.println("    keep path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
                            super.addIfCompetitive(path);
                        } else {
                        //System.out.println("    prevent path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
                        }
                    }

                    @Override
                    protected boolean acceptResult(IntsRef input, Long output) {
                        Util.toBytesRef(input, scratchBytes);
                        finalLastToken.grow(finalLastToken.length() + scratchBytes.length());
                        int lenSav = finalLastToken.length();
                        finalLastToken.append(scratchBytes);
                        //System.out.println("    accept? input='" + scratchBytes.utf8ToString() + "'; lastToken='" + finalLastToken.utf8ToString() + "'; return " + (seen.contains(finalLastToken) == false));
                        boolean ret = seen.contains(finalLastToken.get()) == false;
                        finalLastToken.setLength(lenSav);
                        return ret;
                    }
                };
                // since this search is initialized with a single start node 
                // it is okay to start with an empty input path here
                searcher.addStartPaths(arc, prefixOutput, true, new IntsRefBuilder());
                completions = searcher.search();
                assert completions.isComplete;
            } catch (IOException bogus) {
                throw new RuntimeException(bogus);
            }
            int prefixLength = token.length();
            BytesRefBuilder suffix = new BytesRefBuilder();
            nextCompletion: for (Result<Long> completion : completions) {
                token.setLength(prefixLength);
                // append suffix
                Util.toBytesRef(completion.input, suffix);
                token.append(suffix);
                //System.out.println("    completion " + token.utf8ToString());
                // Skip this path if a higher-order model already
                // saw/predicted its last token:
                BytesRef lastToken = token.get();
                for (int i = token.length() - 1; i >= 0; i--) {
                    if (token.byteAt(i) == separator) {
                        assert token.length() - i - 1 > 0;
                        lastToken = new BytesRef(token.bytes(), i + 1, token.length() - i - 1);
                        break;
                    }
                }
                if (seen.contains(lastToken)) {
                    //System.out.println("      skip dup " + lastToken.utf8ToString());
                    continue nextCompletion;
                }
                seen.add(BytesRef.deepCopyOf(lastToken));
                spare.copyUTF8Bytes(token.get());
                LookupResult result = new LookupResult(spare.toString(), (long) (Long.MAX_VALUE * backoff * ((double) decodeWeight(completion.output)) / contextCount));
                results.add(result);
                assert results.size() == seen.size();
            //System.out.println("  add result=" + result);
            }
            backoff *= ALPHA;
        }
        Collections.sort(results, new Comparator<LookupResult>() {

            @Override
            public int compare(LookupResult a, LookupResult b) {
                if (a.value > b.value) {
                    return -1;
                } else if (a.value < b.value) {
                    return 1;
                } else {
                    // Tie break by UTF16 sort order:
                    return ((String) a.key).compareTo((String) b.key);
                }
            }
        });
        if (results.size() > num) {
            results.subList(num, results.size()).clear();
        }
        return results;
    }
}
Also used : PositionLengthAttribute(org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute) TokenStream(org.apache.lucene.analysis.TokenStream) ArrayList(java.util.ArrayList) Util(org.apache.lucene.util.fst.Util) CodecUtil(org.apache.lucene.codecs.CodecUtil) Result(org.apache.lucene.util.fst.Util.Result) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef) HashSet(java.util.HashSet) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IOException(java.io.IOException) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) BytesReader(org.apache.lucene.util.fst.FST.BytesReader) Arc(org.apache.lucene.util.fst.FST.Arc) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute)
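
For orientation, a minimal end-to-end sketch of building and querying the suggester; the corpus lines, StandardAnalyzer, and the prefix "lucene i" are illustrative assumptions.

import java.io.StringReader;
import java.util.List;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.spell.PlainTextDictionary;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.analyzing.FreeTextSuggester;

public class FreeTextSuggesterSketch {
    public static void main(String[] args) throws Exception {
        FreeTextSuggester suggester = new FreeTextSuggester(new StandardAnalyzer());
        // One phrase per line; the suggester shingles these into its n-gram models.
        suggester.build(new PlainTextDictionary(
                new StringReader("lucene in action\nlucene for search\nsolr in action\n")));
        // Contexts are not supported (see the check at the top of lookup above), so pass null.
        List<LookupResult> results = suggester.lookup("lucene i", null, 5);
        for (LookupResult result : results) {
            System.out.println(result.key + " (score=" + result.value + ")");
        }
    }
}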

Aggregations

OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 53 usages
TokenStream (org.apache.lucene.analysis.TokenStream): 35 usages
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 33 usages
PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 25 usages
StringReader (java.io.StringReader): 20 usages
IOException (java.io.IOException): 15 usages
ArrayList (java.util.ArrayList): 14 usages
BytesRef (org.apache.lucene.util.BytesRef): 14 usages
PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 12 usages
TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 10 usages
Tokenizer (org.apache.lucene.analysis.Tokenizer): 9 usages
Token (org.apache.lucene.analysis.Token): 7 usages
FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 7 usages
TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute): 6 usages
List (java.util.List): 5 usages
Analyzer (org.apache.lucene.analysis.Analyzer): 5 usages
PositionLengthAttribute (org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute): 5 usages
IndexReader (org.apache.lucene.index.IndexReader): 5 usages
CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream): 4 usages
Document (org.apache.lucene.document.Document): 4 usages