
Example 6 with Token

Use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

Class TestAsciiFoldingFilterFactory, method testMultiTermAnalysis:

public void testMultiTermAnalysis() throws IOException {
    TokenFilterFactory factory = new ASCIIFoldingFilterFactory(Collections.emptyMap());
    TokenStream stream = new CannedTokenStream(new Token("Été", 0, 3));
    stream = factory.create(stream);
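    // assertTokenStreamContents (from BaseTokenStreamTestCase) drains the stream and checks the emitted terms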
    assertTokenStreamContents(stream, new String[] { "Ete" });
    factory = (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent();
    stream = new CannedTokenStream(new Token("Été", 0, 3));
    stream = factory.create(stream);
    assertTokenStreamContents(stream, new String[] { "Ete" });
    factory = new ASCIIFoldingFilterFactory(new HashMap<>(Collections.singletonMap("preserveOriginal", "true")));
    stream = new CannedTokenStream(new Token("Été", 0, 3));
    stream = factory.create(stream);
    assertTokenStreamContents(stream, new String[] { "Ete", "Été" });
    factory = (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent();
    stream = new CannedTokenStream(new Token("Été", 0, 3));
    stream = factory.create(stream);
    assertTokenStreamContents(stream, new String[] { "Ete" });
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) MultiTermAwareComponent(org.apache.lucene.analysis.util.MultiTermAwareComponent) HashMap(java.util.HashMap) Token(org.apache.lucene.analysis.Token) TokenFilterFactory(org.apache.lucene.analysis.util.TokenFilterFactory)
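
For context, the same folding can be applied to a live analysis chain instead of a CannedTokenStream. A minimal sketch, assuming the standard org.apache.lucene.analysis.core.WhitespaceTokenizer and org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter classes; the method name is hypothetical and this is not part of the original test:

public void foldWithRealChain() throws IOException {
    // tokenize on whitespace, then strip diacritics with ASCIIFoldingFilter
    Tokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader("Crème brûlée"));
    TokenStream folded = new ASCIIFoldingFilter(source);
    CharTermAttribute term = folded.addAttribute(CharTermAttribute.class);
    folded.reset();
    while (folded.incrementToken()) {
        // prints "Creme", then "brulee"
        System.out.println(term.toString());
    }
    folded.end();
    folded.close();
}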

Example 7 with Token

Use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

Class TestPostingsOffsets, method testRandom:

public void testRandom() throws Exception {
    // token -> docID -> tokens
    final Map<String, Map<Integer, List<Token>>> actualTokens = new HashMap<>();
    Directory dir = newDirectory();
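    // iwc is an IndexWriterConfig field of the test class (presumably initialized in setUp(); not shown in this snippet)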
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
    final int numDocs = atLeast(20);
    //final int numDocs = atLeast(5);
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    // TODO: randomize what IndexOptions we use; also test
    // changing this up in one IW buffered segment...:
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    if (random().nextBoolean()) {
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorOffsets(random().nextBoolean());
        ft.setStoreTermVectorPositions(random().nextBoolean());
    }
    for (int docCount = 0; docCount < numDocs; docCount++) {
        Document doc = new Document();
        doc.add(new NumericDocValuesField("id", docCount));
        List<Token> tokens = new ArrayList<>();
        final int numTokens = atLeast(100);
        //final int numTokens = atLeast(20);
        int pos = -1;
        int offset = 0;
        //System.out.println("doc id=" + docCount);
        for (int tokenCount = 0; tokenCount < numTokens; tokenCount++) {
            final String text;
            if (random().nextBoolean()) {
                text = "a";
            } else if (random().nextBoolean()) {
                text = "b";
            } else if (random().nextBoolean()) {
                text = "c";
            } else {
                text = "d";
            }
            int posIncr = random().nextBoolean() ? 1 : random().nextInt(5);
            if (tokenCount == 0 && posIncr == 0) {
                posIncr = 1;
            }
            final int offIncr = random().nextBoolean() ? 0 : random().nextInt(5);
            final int tokenOffset = random().nextInt(5);
            final Token token = makeToken(text, posIncr, offset + offIncr, offset + offIncr + tokenOffset);
            if (!actualTokens.containsKey(text)) {
                actualTokens.put(text, new HashMap<Integer, List<Token>>());
            }
            final Map<Integer, List<Token>> postingsByDoc = actualTokens.get(text);
            if (!postingsByDoc.containsKey(docCount)) {
                postingsByDoc.put(docCount, new ArrayList<Token>());
            }
            postingsByDoc.get(docCount).add(token);
            tokens.add(token);
            pos += posIncr;
            // stuff abs position into type:
            token.setType("" + pos);
            offset += offIncr + tokenOffset;
        //System.out.println("  " + token + " posIncr=" + token.getPositionIncrement() + " pos=" + pos + " off=" + token.startOffset() + "/" + token.endOffset() + " (freq=" + postingsByDoc.get(docCount).size() + ")");
        }
        doc.add(new Field("content", new CannedTokenStream(tokens.toArray(new Token[tokens.size()])), ft));
        w.addDocument(doc);
    }
    final DirectoryReader r = w.getReader();
    w.close();
    final String[] terms = new String[] { "a", "b", "c", "d" };
    for (LeafReaderContext ctx : r.leaves()) {
        // TODO: improve this
        LeafReader sub = ctx.reader();
        //System.out.println("\nsub=" + sub);
        final TermsEnum termsEnum = sub.fields().terms("content").iterator();
        PostingsEnum docs = null;
        PostingsEnum docsAndPositions = null;
        PostingsEnum docsAndPositionsAndOffsets = null;
        int[] docIDToID = new int[sub.maxDoc()];
        NumericDocValues values = DocValues.getNumeric(sub, "id");
        for (int i = 0; i < sub.maxDoc(); i++) {
            assertEquals(i, values.nextDoc());
            docIDToID[i] = (int) values.longValue();
        }
        for (String term : terms) {
            //System.out.println("  term=" + term);
            if (termsEnum.seekExact(new BytesRef(term))) {
                docs = termsEnum.postings(docs);
                assertNotNull(docs);
                int doc;
                //System.out.println("    doc/freq");
                while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
                    //System.out.println("      doc=" + docIDToID[doc] + " docID=" + doc + " " + expected.size() + " freq");
                    assertNotNull(expected);
                    assertEquals(expected.size(), docs.freq());
                }
                // explicitly exclude offsets here
                docsAndPositions = termsEnum.postings(docsAndPositions, PostingsEnum.ALL);
                assertNotNull(docsAndPositions);
                //System.out.println("    doc/freq/pos");
                while ((doc = docsAndPositions.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
                    //System.out.println("      doc=" + docIDToID[doc] + " " + expected.size() + " freq");
                    assertNotNull(expected);
                    assertEquals(expected.size(), docsAndPositions.freq());
                    for (Token token : expected) {
                        int pos = Integer.parseInt(token.type());
                        //System.out.println("        pos=" + pos);
                        assertEquals(pos, docsAndPositions.nextPosition());
                    }
                }
                docsAndPositionsAndOffsets = termsEnum.postings(docsAndPositions, PostingsEnum.ALL);
                assertNotNull(docsAndPositionsAndOffsets);
                //System.out.println("    doc/freq/pos/offs");
                while ((doc = docsAndPositionsAndOffsets.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
                    //System.out.println("      doc=" + docIDToID[doc] + " " + expected.size() + " freq");
                    assertNotNull(expected);
                    assertEquals(expected.size(), docsAndPositionsAndOffsets.freq());
                    for (Token token : expected) {
                        int pos = Integer.parseInt(token.type());
                        //System.out.println("        pos=" + pos);
                        assertEquals(pos, docsAndPositionsAndOffsets.nextPosition());
                        assertEquals(token.startOffset(), docsAndPositionsAndOffsets.startOffset());
                        assertEquals(token.endOffset(), docsAndPositionsAndOffsets.endOffset());
                    }
                }
            }
        }
    // TODO: test advance:
    }
    r.close();
    dir.close();
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Token(org.apache.lucene.analysis.Token) Document(org.apache.lucene.document.Document) StringField(org.apache.lucene.document.StringField) NumericDocValuesField(org.apache.lucene.document.NumericDocValuesField) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) List(java.util.List) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) FieldType(org.apache.lucene.document.FieldType) Map(java.util.Map)
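
The makeToken(text, posIncr, startOffset, endOffset) helper referenced above is a private method of TestPostingsOffsets that is not included in this snippet. A plausible sketch based on the Token API (setPositionIncrement and setOffset), offered as an assumption rather than the verbatim source:

private Token makeToken(String text, int posIncr, int startOffset, int endOffset) {
    final Token t = new Token();
    t.append(text);
    t.setPositionIncrement(posIncr);
    t.setOffset(startOffset, endOffset);
    return t;
}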

Example 8 with Token

Use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

Class TestPhraseQuery, method testZeroPosIncr:

/** Tests PhraseQuery with terms at the same position in the query. */
public void testZeroPosIncr() throws IOException {
    Directory dir = newDirectory();
    final Token[] tokens = new Token[3];
    tokens[0] = new Token();
    tokens[0].append("a");
    tokens[0].setPositionIncrement(1);
    tokens[1] = new Token();
    tokens[1].append("aa");
    tokens[1].setPositionIncrement(0);
    tokens[2] = new Token();
    tokens[2].append("b");
    tokens[2].setPositionIncrement(1);
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    doc.add(new TextField("field", new CannedTokenStream(tokens)));
    writer.addDocument(doc);
    IndexReader r = writer.getReader();
    writer.close();
    IndexSearcher searcher = newSearcher(r);
    // Sanity check; simple "a b" phrase:
    PhraseQuery.Builder pqBuilder = new PhraseQuery.Builder();
    pqBuilder.add(new Term("field", "a"), 0);
    pqBuilder.add(new Term("field", "b"), 1);
    assertEquals(1, searcher.search(pqBuilder.build(), 1).totalHits);
    // Now with "a|aa b"
    pqBuilder = new PhraseQuery.Builder();
    pqBuilder.add(new Term("field", "a"), 0);
    pqBuilder.add(new Term("field", "aa"), 0);
    pqBuilder.add(new Term("field", "b"), 1);
    assertEquals(1, searcher.search(pqBuilder.build(), 1).totalHits);
    // Now with "a|z b" which should not match; this isn't a MultiPhraseQuery
    pqBuilder = new PhraseQuery.Builder();
    pqBuilder.add(new Term("field", "a"), 0);
    pqBuilder.add(new Term("field", "z"), 0);
    pqBuilder.add(new Term("field", "b"), 1);
    assertEquals(0, searcher.search(pqBuilder.build(), 1).totalHits);
    r.close();
    dir.close();
}
Also used : Token(org.apache.lucene.analysis.Token) Term(org.apache.lucene.index.Term) Document(org.apache.lucene.document.Document) IndexReader(org.apache.lucene.index.IndexReader) TextField(org.apache.lucene.document.TextField) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) Directory(org.apache.lucene.store.Directory)
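
The last check relies on PhraseQuery treating two terms at the same position as a conjunction. For OR semantics at a position one would reach for MultiPhraseQuery instead; a minimal sketch (not part of the original test, reusing the same searcher and field):

MultiPhraseQuery.Builder mpqBuilder = new MultiPhraseQuery.Builder();
// either "a" or "z" at position 0, then "b" at position 1
mpqBuilder.add(new Term[] { new Term("field", "a"), new Term("field", "z") }, 0);
mpqBuilder.add(new Term[] { new Term("field", "b") }, 1);
// unlike the "a|z b" PhraseQuery above, this query would match the indexed document, since "a" occurs at position 0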

Example 9 with Token

Use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

Class TestMultiPhraseQuery, method makeToken:

private static Token makeToken(String text, int posIncr) {
    final Token t = new Token();
    t.append(text);
    t.setPositionIncrement(posIncr);
    return t;
}
Also used : Token(org.apache.lucene.analysis.Token)
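
A hypothetical use of this helper (not taken from the original test class), showing how a zero position increment stacks a token on top of the previous one when the tokens are fed to a CannedTokenStream:

Token[] tokens = new Token[] { makeToken("multi", 1), makeToken("phrase", 1), makeToken("query", 0) };
// "query" shares a position with "phrase" because its increment is 0
TokenStream stream = new CannedTokenStream(tokens);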

Example 10 with Token

Use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

Class TestMultiPhraseQuery, method testZeroPosIncrSloppyMpqAnd:

/**
   * MPQ AND Mode - Manually creating a multiple phrase query
   */
public void testZeroPosIncrSloppyMpqAnd() throws IOException {
    final MultiPhraseQuery.Builder mpqb = new MultiPhraseQuery.Builder();
    int pos = -1;
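    // INCR_0_QUERY_TOKENS_AND (a Token[] containing some zero-increment tokens) and doTestZeroPosIncrSloppy(...)
    // are members of TestMultiPhraseQuery that are not shown in this snippet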
    for (Token tap : INCR_0_QUERY_TOKENS_AND) {
        pos += tap.getPositionIncrement();
        //AND logic
        mpqb.add(new Term[] { new Term("field", tap.toString()) }, pos);
    }
    doTestZeroPosIncrSloppy(mpqb.build(), 0);
    mpqb.setSlop(1);
    doTestZeroPosIncrSloppy(mpqb.build(), 0);
    mpqb.setSlop(2);
    doTestZeroPosIncrSloppy(mpqb.build(), 1);
}
Also used : Token(org.apache.lucene.analysis.Token) Term(org.apache.lucene.index.Term)
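
The token array driving the loop is defined elsewhere in TestMultiPhraseQuery. As an illustration only (the actual terms in the test differ), such an array could be built with the makeToken helper from Example 9, using zero increments to stack alternatives at one position:

// hypothetical contents; not the values used by the real test
private static final Token[] INCR_0_QUERY_TOKENS_AND = new Token[] {
    makeToken("a", 1),
    makeToken("aa", 0),  // same position as "a"
    makeToken("b", 1)
};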

Aggregations

Token (org.apache.lucene.analysis.Token): 100
CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream): 39
TokenStream (org.apache.lucene.analysis.TokenStream): 31
Directory (org.apache.lucene.store.Directory): 24
Test (org.junit.Test): 23
Document (org.apache.lucene.document.Document): 19
TextField (org.apache.lucene.document.TextField): 19
BytesRef (org.apache.lucene.util.BytesRef): 16
NamedList (org.apache.solr.common.util.NamedList): 16
StringReader (java.io.StringReader): 15
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 15
Analyzer (org.apache.lucene.analysis.Analyzer): 14
ArrayList (java.util.ArrayList): 13
Map (java.util.Map): 13
Field (org.apache.lucene.document.Field): 13
FieldType (org.apache.lucene.document.FieldType): 11
IndexReader (org.apache.lucene.index.IndexReader): 11
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 10
Tokenizer (org.apache.lucene.analysis.Tokenizer): 9
Date (java.util.Date): 8