Example 36 with Token

Use of org.apache.lucene.analysis.Token in the project lucene-solr by apache.

From the class ShingleFilterTest, method testTwoTrailingHolesTriShingleWithTokenFiller.

public void testTwoTrailingHolesTriShingleWithTokenFiller() throws IOException {
    // Analyzing "purple wizard of the", where "of" and "the" are removed as
    // stopwords, leaving two trailing holes:
    Token[] inputTokens = new Token[] { createToken("purple", 0, 6), createToken("wizard", 7, 13) };
    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
    filter.setFillerToken("--");
    assertTokenStreamContents(filter, new String[] { "purple", "purple wizard", "purple wizard --", "wizard", "wizard --", "wizard -- --" }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20);
    filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
    filter.setFillerToken("");
    assertTokenStreamContents(filter, new String[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  " }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20);
    filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
    filter.setFillerToken(null);
    assertTokenStreamContents(filter, new String[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  " }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20);
    filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
    filter.setFillerToken(null);
    filter.setTokenSeparator(null);
    assertTokenStreamContents(filter, new String[] { "purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard" }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20);
}
Also used: Token(org.apache.lucene.analysis.Token) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream)
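
For context, ShingleFilter sits at the end of an analysis chain and emits word n-grams alongside the unigrams, which is what the expected-output arrays above encode. A minimal sketch of wiring a (2, 3) shingle filter into an Analyzer; the WhitespaceTokenizer and the anonymous Analyzer are illustrative choices, not taken from the test:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;

// Emits unigrams plus 2- and 3-gram shingles, matching the (2, 3) arguments above.
Analyzer shingleAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new WhitespaceTokenizer();
        TokenStream sink = new ShingleFilter(source, 2, 3);
        return new TokenStreamComponents(source, sink);
    }
};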

Example 37 with Token

Use of org.apache.lucene.analysis.Token in the project lucene-solr by apache.

From the class WordBreakSolrSpellCheckerTest, method testStandAlone.

@Test
public void testStandAlone() throws Exception {
    SolrCore core = h.getCore();
    WordBreakSolrSpellChecker checker = new WordBreakSolrSpellChecker();
    NamedList<String> params = new NamedList<>();
    params.add("field", "lowerfilt");
    params.add(WordBreakSolrSpellChecker.PARAM_BREAK_WORDS, "true");
    params.add(WordBreakSolrSpellChecker.PARAM_COMBINE_WORDS, "true");
    params.add(WordBreakSolrSpellChecker.PARAM_MAX_CHANGES, "10");
    checker.init(params, core);
    RefCounted<SolrIndexSearcher> searcher = core.getSearcher();
    QueryConverter qc = new SpellingQueryConverter();
    qc.setAnalyzer(new MockAnalyzer(random()));
    {
        // Prior to SOLR-8175, the required term (the leading '+') caused an ArrayIndexOutOfBoundsException (AIOOBE).
        Collection<Token> tokens = qc.convert("+pine apple good ness");
        SpellingOptions spellOpts = new SpellingOptions(tokens, searcher.get().getIndexReader(), 10);
        SpellingResult result = checker.getSuggestions(spellOpts);
        searcher.decref();
        assertTrue(result != null && result.getSuggestions() != null);
        assertTrue(result.getSuggestions().size() == 5);
    }
    Collection<Token> tokens = qc.convert("paintable pine apple good ness");
    SpellingOptions spellOpts = new SpellingOptions(tokens, searcher.get().getIndexReader(), 10);
    SpellingResult result = checker.getSuggestions(spellOpts);
    searcher.decref();
    assertTrue(result != null && result.getSuggestions() != null);
    assertTrue(result.getSuggestions().size() == 9);
    for (Map.Entry<Token, LinkedHashMap<String, Integer>> s : result.getSuggestions().entrySet()) {
        Token orig = s.getKey();
        String[] corr = s.getValue().keySet().toArray(new String[0]);
        if (orig.toString().equals("paintable")) {
            assertTrue(orig.startOffset() == 0);
            assertTrue(orig.endOffset() == 9);
            assertTrue(orig.length() == 9);
            assertTrue(corr.length == 3);
            // 1 op; max doc freq=5
            assertTrue(corr[0].equals("paint able"));
            // 1 op; max doc freq=2
            assertTrue(corr[1].equals("pain table"));
            // 2 ops
            assertTrue(corr[2].equals("pa in table"));
        } else if (orig.toString().equals("pine apple")) {
            assertTrue(orig.startOffset() == 10);
            assertTrue(orig.endOffset() == 20);
            assertTrue(orig.length() == 10);
            assertTrue(corr.length == 1);
            assertTrue(corr[0].equals("pineapple"));
        } else if (orig.toString().equals("paintable pine")) {
            assertTrue(orig.startOffset() == 0);
            assertTrue(orig.endOffset() == 14);
            assertTrue(orig.length() == 14);
            assertTrue(corr.length == 1);
            assertTrue(corr[0].equals("paintablepine"));
        } else if (orig.toString().equals("good ness")) {
            assertTrue(orig.startOffset() == 21);
            assertTrue(orig.endOffset() == 30);
            assertTrue(orig.length() == 9);
            assertTrue(corr.length == 1);
            assertTrue(corr[0].equals("goodness"));
        } else if (orig.toString().equals("pine apple good ness")) {
            assertTrue(orig.startOffset() == 10);
            assertTrue(orig.endOffset() == 30);
            assertTrue(orig.length() == 20);
            assertTrue(corr.length == 1);
            assertTrue(corr[0].equals("pineapplegoodness"));
        } else if (orig.toString().equals("pine")) {
            assertTrue(orig.startOffset() == 10);
            assertTrue(orig.endOffset() == 14);
            assertTrue(orig.length() == 4);
            assertTrue(corr.length == 1);
            assertTrue(corr[0].equals("pi ne"));
        } else if (orig.toString().equals("apple")) {
            assertTrue(orig.startOffset() == 15);
            assertTrue(orig.endOffset() == 20);
            assertTrue(orig.length() == 5);
            assertTrue(corr.length == 0);
        } else if (orig.toString().equals("good")) {
            assertTrue(orig.startOffset() == 21);
            assertTrue(orig.endOffset() == 25);
            assertTrue(orig.length() == 4);
            assertTrue(corr.length == 0);
        } else if (orig.toString().equals("ness")) {
            assertTrue(orig.startOffset() == 26);
            assertTrue(orig.endOffset() == 30);
            assertTrue(orig.length() == 4);
            assertTrue(corr.length == 0);
        } else {
            fail("Unexpected original result: " + orig);
        }
    }
}
Also used: SolrCore(org.apache.solr.core.SolrCore) NamedList(org.apache.solr.common.util.NamedList) Token(org.apache.lucene.analysis.Token) SolrIndexSearcher(org.apache.solr.search.SolrIndexSearcher) LinkedHashMap(java.util.LinkedHashMap) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Collection(java.util.Collection) Map(java.util.Map) Test(org.junit.Test)
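
The offset numbers asserted above are plain character positions in the converted query string, with end offsets exclusive. A quick illustrative check (not part of the test):

String query = "paintable pine apple good ness";
// "pine" begins at index 10, so the combined token "pine apple" has startOffset 10
assert query.indexOf("pine") == 10;
// end offsets are exclusive: "pine apple" ends at 20, "good ness" at 30
assert query.indexOf("apple") + "apple".length() == 20;
assert query.indexOf("ness") + "ness".length() == 30;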

Example 38 with Token

Use of org.apache.lucene.analysis.Token in the project lucene-solr by apache.

From the class ShingleFilterTest, method createToken.

private static Token createToken(String term, int start, int offset, int positionIncrement) {
    Token token = new Token();
    // "offset" here is the token's end offset: setOffset takes (startOffset, endOffset)
    token.setOffset(start, offset);
    token.copyBuffer(term.toCharArray(), 0, term.length());
    token.setPositionIncrement(positionIncrement);
    return token;
}
Also used: Token(org.apache.lucene.analysis.Token)
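
Example 36 calls createToken with only three arguments, so the test class presumably also defines a convenience overload that defaults the position increment to 1. A sketch of what that overload would look like (it is not shown in this excerpt):

// Presumed three-argument companion overload: position increment defaults to 1.
private static Token createToken(String term, int start, int offset) {
    return createToken(term, start, offset, 1);
}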

Example 39 with Token

Use of org.apache.lucene.analysis.Token in the project lucene-solr by apache.

From the class TestAsciiFoldingFilterFactory, method testMultiTermAnalysis.

public void testMultiTermAnalysis() throws IOException {
    TokenFilterFactory factory = new ASCIIFoldingFilterFactory(Collections.emptyMap());
    TokenStream stream = new CannedTokenStream(new Token("Été", 0, 3));
    stream = factory.create(stream);
    assertTokenStreamContents(stream, new String[] { "Ete" });
    factory = (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent();
    stream = new CannedTokenStream(new Token("Été", 0, 3));
    stream = factory.create(stream);
    assertTokenStreamContents(stream, new String[] { "Ete" });
    factory = new ASCIIFoldingFilterFactory(new HashMap<>(Collections.singletonMap("preserveOriginal", "true")));
    stream = new CannedTokenStream(new Token("Été", 0, 3));
    stream = factory.create(stream);
    assertTokenStreamContents(stream, new String[] { "Ete", "Été" });
    factory = (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent();
    stream = new CannedTokenStream(new Token("Été", 0, 3));
    stream = factory.create(stream);
    assertTokenStreamContents(stream, new String[] { "Ete" });
}
Also used: CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) MultiTermAwareComponent(org.apache.lucene.analysis.util.MultiTermAwareComponent) HashMap(java.util.HashMap) Token(org.apache.lucene.analysis.Token) TokenFilterFactory(org.apache.lucene.analysis.util.TokenFilterFactory)
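
The asymmetry in the final assertion is the point of the test: multi-term queries (wildcard, prefix, fuzzy) require analysis to yield exactly one token per input term, so the multi-term variant of the factory drops the preserveOriginal behavior and folds to a single token. An illustrative use of the same API (variable names are hypothetical):

// The base factory, configured to keep the original term alongside the folded one.
TokenFilterFactory base = new ASCIIFoldingFilterFactory(
    new HashMap<>(Collections.singletonMap("preserveOriginal", "true")));
// Its multi-term counterpart folds to a single token: "Été" -> "Ete" only,
// so a wildcard such as Été* can match the folded terms in the index.
TokenFilterFactory multiTerm =
    (TokenFilterFactory) ((MultiTermAwareComponent) base).getMultiTermComponent();
TokenStream folded = multiTerm.create(new CannedTokenStream(new Token("Été", 0, 3)));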

Example 40 with Token

Use of org.apache.lucene.analysis.Token in the project lucene-solr by apache.

From the class TestPostingsOffsets, method testRandom.

public void testRandom() throws Exception {
    // token -> docID -> tokens
    final Map<String, Map<Integer, List<Token>>> actualTokens = new HashMap<>();
    Directory dir = newDirectory();
    // iwc (an IndexWriterConfig) is defined elsewhere in the test class; it is not shown in this excerpt
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
    final int numDocs = atLeast(20);
    //final int numDocs = atLeast(5);
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    // TODO: randomize what IndexOptions we use; also test
    // changing this up in one IW buffered segment...:
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    if (random().nextBoolean()) {
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorOffsets(random().nextBoolean());
        ft.setStoreTermVectorPositions(random().nextBoolean());
    }
    for (int docCount = 0; docCount < numDocs; docCount++) {
        Document doc = new Document();
        doc.add(new NumericDocValuesField("id", docCount));
        List<Token> tokens = new ArrayList<>();
        final int numTokens = atLeast(100);
        //final int numTokens = atLeast(20);
        int pos = -1;
        int offset = 0;
        //System.out.println("doc id=" + docCount);
        for (int tokenCount = 0; tokenCount < numTokens; tokenCount++) {
            final String text;
            if (random().nextBoolean()) {
                text = "a";
            } else if (random().nextBoolean()) {
                text = "b";
            } else if (random().nextBoolean()) {
                text = "c";
            } else {
                text = "d";
            }
            int posIncr = random().nextBoolean() ? 1 : random().nextInt(5);
            if (tokenCount == 0 && posIncr == 0) {
                posIncr = 1;
            }
            final int offIncr = random().nextBoolean() ? 0 : random().nextInt(5);
            final int tokenOffset = random().nextInt(5);
            final Token token = makeToken(text, posIncr, offset + offIncr, offset + offIncr + tokenOffset);
            if (!actualTokens.containsKey(text)) {
                actualTokens.put(text, new HashMap<Integer, List<Token>>());
            }
            final Map<Integer, List<Token>> postingsByDoc = actualTokens.get(text);
            if (!postingsByDoc.containsKey(docCount)) {
                postingsByDoc.put(docCount, new ArrayList<Token>());
            }
            postingsByDoc.get(docCount).add(token);
            tokens.add(token);
            pos += posIncr;
            // stuff abs position into type:
            token.setType("" + pos);
            offset += offIncr + tokenOffset;
        //System.out.println("  " + token + " posIncr=" + token.getPositionIncrement() + " pos=" + pos + " off=" + token.startOffset() + "/" + token.endOffset() + " (freq=" + postingsByDoc.get(docCount).size() + ")");
        }
        doc.add(new Field("content", new CannedTokenStream(tokens.toArray(new Token[tokens.size()])), ft));
        w.addDocument(doc);
    }
    final DirectoryReader r = w.getReader();
    w.close();
    final String[] terms = new String[] { "a", "b", "c", "d" };
    for (LeafReaderContext ctx : r.leaves()) {
        // TODO: improve this
        LeafReader sub = ctx.reader();
        //System.out.println("\nsub=" + sub);
        final TermsEnum termsEnum = sub.fields().terms("content").iterator();
        PostingsEnum docs = null;
        PostingsEnum docsAndPositions = null;
        PostingsEnum docsAndPositionsAndOffsets = null;
        int[] docIDToID = new int[sub.maxDoc()];
        NumericDocValues values = DocValues.getNumeric(sub, "id");
        for (int i = 0; i < sub.maxDoc(); i++) {
            assertEquals(i, values.nextDoc());
            docIDToID[i] = (int) values.longValue();
        }
        for (String term : terms) {
            //System.out.println("  term=" + term);
            if (termsEnum.seekExact(new BytesRef(term))) {
                docs = termsEnum.postings(docs);
                assertNotNull(docs);
                int doc;
                //System.out.println("    doc/freq");
                while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
                    //System.out.println("      doc=" + docIDToID[doc] + " docID=" + doc + " " + expected.size() + " freq");
                    assertNotNull(expected);
                    assertEquals(expected.size(), docs.freq());
                }
                // request full postings here; this pass only verifies freqs and positions (offsets are re-checked below)
                docsAndPositions = termsEnum.postings(docsAndPositions, PostingsEnum.ALL);
                assertNotNull(docsAndPositions);
                //System.out.println("    doc/freq/pos");
                while ((doc = docsAndPositions.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
                    //System.out.println("      doc=" + docIDToID[doc] + " " + expected.size() + " freq");
                    assertNotNull(expected);
                    assertEquals(expected.size(), docsAndPositions.freq());
                    for (Token token : expected) {
                        int pos = Integer.parseInt(token.type());
                        //System.out.println("        pos=" + pos);
                        assertEquals(pos, docsAndPositions.nextPosition());
                    }
                }
                docsAndPositionsAndOffsets = termsEnum.postings(docsAndPositions, PostingsEnum.ALL);
                assertNotNull(docsAndPositionsAndOffsets);
                //System.out.println("    doc/freq/pos/offs");
                while ((doc = docsAndPositionsAndOffsets.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
                    //System.out.println("      doc=" + docIDToID[doc] + " " + expected.size() + " freq");
                    assertNotNull(expected);
                    assertEquals(expected.size(), docsAndPositionsAndOffsets.freq());
                    for (Token token : expected) {
                        int pos = Integer.parseInt(token.type());
                        //System.out.println("        pos=" + pos);
                        assertEquals(pos, docsAndPositionsAndOffsets.nextPosition());
                        assertEquals(token.startOffset(), docsAndPositionsAndOffsets.startOffset());
                        assertEquals(token.endOffset(), docsAndPositionsAndOffsets.endOffset());
                    }
                }
            }
        }
    // TODO: test advance:
    }
    r.close();
    dir.close();
}
Also used: HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Token(org.apache.lucene.analysis.Token) Document(org.apache.lucene.document.Document) StringField(org.apache.lucene.document.StringField) NumericDocValuesField(org.apache.lucene.document.NumericDocValuesField) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) List(java.util.List) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) FieldType(org.apache.lucene.document.FieldType) Map(java.util.Map)
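
The makeToken helper called in the loop above is not shown in this excerpt. Given how the test later reads back positions and offsets, it presumably builds a Token much like Example 38's createToken; a sketch:

// Presumed shape of the helper: term text, position increment, and start/end offsets.
private static Token makeToken(String text, int posIncr, int startOffset, int endOffset) {
    final Token t = new Token();
    t.append(text);
    t.setPositionIncrement(posIncr);
    t.setOffset(startOffset, endOffset);
    return t;
}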

Aggregations

Token (org.apache.lucene.analysis.Token): 100
CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream): 39
TokenStream (org.apache.lucene.analysis.TokenStream): 31
Directory (org.apache.lucene.store.Directory): 24
Test (org.junit.Test): 23
Document (org.apache.lucene.document.Document): 19
TextField (org.apache.lucene.document.TextField): 19
BytesRef (org.apache.lucene.util.BytesRef): 16
NamedList (org.apache.solr.common.util.NamedList): 16
StringReader (java.io.StringReader): 15
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 15
Analyzer (org.apache.lucene.analysis.Analyzer): 14
ArrayList (java.util.ArrayList): 13
Map (java.util.Map): 13
Field (org.apache.lucene.document.Field): 13
FieldType (org.apache.lucene.document.FieldType): 11
IndexReader (org.apache.lucene.index.IndexReader): 11
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 10
Tokenizer (org.apache.lucene.analysis.Tokenizer): 9
Date (java.util.Date): 8