
Example 51 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.

From class TestPhraseQuery, method testRandomPhrases.

public void testRandomPhrases() throws Exception {
    Directory dir = newDirectory();
    Analyzer analyzer = new MockAnalyzer(random());
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig(analyzer).setMergePolicy(newLogMergePolicy()));
    List<List<String>> docs = new ArrayList<>();
    Document d = new Document();
    Field f = newTextField("f", "", Field.Store.NO);
    d.add(f);
    Random r = random();
    int NUM_DOCS = atLeast(10);
    for (int i = 0; i < NUM_DOCS; i++) {
        // must be > 4096 so it spans multiple chunks
        int termCount = TestUtil.nextInt(random(), 4097, 8200);
        List<String> doc = new ArrayList<>();
        StringBuilder sb = new StringBuilder();
        while (doc.size() < termCount) {
            if (r.nextInt(5) == 1 || docs.size() == 0) {
                // make new non-empty-string term
                String term;
                while (true) {
                    term = TestUtil.randomUnicodeString(r);
                    if (term.length() > 0) {
                        break;
                    }
                }
                try (TokenStream ts = analyzer.tokenStream("ignore", term)) {
                    CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class);
                    ts.reset();
                    while (ts.incrementToken()) {
                        String text = termAttr.toString();
                        doc.add(text);
                        sb.append(text).append(' ');
                    }
                    ts.end();
                }
            } else {
                // pick existing sub-phrase
                List<String> lastDoc = docs.get(r.nextInt(docs.size()));
                int len = TestUtil.nextInt(r, 1, 10);
                int start = r.nextInt(lastDoc.size() - len);
                for (int k = start; k < start + len; k++) {
                    String t = lastDoc.get(k);
                    doc.add(t);
                    sb.append(t).append(' ');
                }
            }
        }
        docs.add(doc);
        f.setStringValue(sb.toString());
        w.addDocument(d);
    }
    IndexReader reader = w.getReader();
    IndexSearcher s = newSearcher(reader);
    w.close();
    // now search
    int num = atLeast(10);
    for (int i = 0; i < num; i++) {
        int docID = r.nextInt(docs.size());
        List<String> doc = docs.get(docID);
        final int numTerm = TestUtil.nextInt(r, 2, 20);
        final int start = r.nextInt(doc.size() - numTerm);
        PhraseQuery.Builder builder = new PhraseQuery.Builder();
        StringBuilder sb = new StringBuilder();
        for (int t = start; t < start + numTerm; t++) {
            // absolute positions are fine here: PhraseQuery matching only depends on the relative gaps between terms
            builder.add(new Term("f", doc.get(t)), t);
            sb.append(doc.get(t)).append(' ');
        }
        PhraseQuery pq = builder.build();
        TopDocs hits = s.search(pq, NUM_DOCS);
        boolean found = false;
        for (int j = 0; j < hits.scoreDocs.length; j++) {
            if (hits.scoreDocs[j].doc == docID) {
                found = true;
                break;
            }
        }
        assertTrue("phrase '" + sb + "' not found; start=" + start + ", it=" + i + ", expected doc " + docID, found);
    }
    reader.close();
    dir.close();
}
Also used: CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream), TokenStream (org.apache.lucene.analysis.TokenStream), Analyzer (org.apache.lucene.analysis.Analyzer), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), Document (org.apache.lucene.document.Document), Field (org.apache.lucene.document.Field), TextField (org.apache.lucene.document.TextField), Term (org.apache.lucene.index.Term), IndexReader (org.apache.lucene.index.IndexReader), RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter), Directory (org.apache.lucene.store.Directory), ArrayList (java.util.ArrayList), List (java.util.List), Random (java.util.Random)
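
The core of the test above is the standard TokenStream consumption loop: add the CharTermAttribute, reset(), iterate with incrementToken(), then end(). A minimal sketch of that pattern pulled out on its own (the WhitespaceAnalyzer, field name, and sample text are illustrative placeholders, not part of the original test):

public static List<String> collectTerms(Analyzer analyzer, String field, String text) throws IOException {
    List<String> terms = new ArrayList<>();
    try (TokenStream ts = analyzer.tokenStream(field, text)) {
        CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class);
        // reset() is mandatory before the first incrementToken()
        ts.reset();
        while (ts.incrementToken()) {
            // copy the text out: the single attribute instance is reused for every token
            terms.add(termAttr.toString());
        }
        // end() records end-of-stream state; try-with-resources closes the stream
        ts.end();
    }
    return terms;
}

For example, collectTerms(new WhitespaceAnalyzer(), "f", "foo bar") would return [foo, bar] (WhitespaceAnalyzer lives in org.apache.lucene.analysis.core in this lucene-solr snapshot).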

Example 52 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.

From class EdgeNGramTokenFilterTest, method testSupplementaryCharacters.

public void testSupplementaryCharacters() throws IOException {
    final String s = TestUtil.randomUnicodeString(random(), 10);
    final int codePointCount = s.codePointCount(0, s.length());
    final int minGram = TestUtil.nextInt(random(), 1, 3);
    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer();
    ((Tokenizer) tk).setReader(new StringReader(s));
    tk = new EdgeNGramTokenFilter(tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
        assertTrue(tk.incrementToken());
        assertEquals(0, offsetAtt.startOffset());
        assertEquals(s.length(), offsetAtt.endOffset());
        final int end = Character.offsetByCodePoints(s, 0, i);
        assertEquals(s.substring(0, end), termAtt.toString());
    }
    assertFalse(tk.incrementToken());
    // finish the TokenStream contract: end(), then close()
    tk.end();
    tk.close();
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), Tokenizer (org.apache.lucene.analysis.Tokenizer), MockTokenizer (org.apache.lucene.analysis.MockTokenizer), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer), WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer), LetterTokenizer (org.apache.lucene.analysis.core.LetterTokenizer), StringReader (java.io.StringReader)
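
The assertions above hinge on the difference between char indices and code-point indices: a supplementary character occupies two Java chars, so gram boundaries must be computed with code-point-aware APIs. A standalone sketch of that conversion, with an illustrative sample string (U+1D11E, the musical G clef, is encoded as a surrogate pair):

String s = "a\uD834\uDD1Eb";                          // "a𝄞b": 3 code points, but 4 chars
int codePointCount = s.codePointCount(0, s.length()); // 3
int end = Character.offsetByCodePoints(s, 0, 2);      // char index after the first 2 code points: 3
String gram = s.substring(0, end);                    // "a𝄞": the surrogate pair stays intact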

Example 53 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.

From class NGramTokenFilterTest, method testSupplementaryCharacters.

public void testSupplementaryCharacters() throws IOException {
    final String s = TestUtil.randomUnicodeString(random(), 10);
    final int codePointCount = s.codePointCount(0, s.length());
    final int minGram = TestUtil.nextInt(random(), 1, 3);
    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer();
    ((Tokenizer) tk).setReader(new StringReader(s));
    tk = new NGramTokenFilter(tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int start = 0; start < codePointCount; ++start) {
        for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
            assertTrue(tk.incrementToken());
            assertEquals(0, offsetAtt.startOffset());
            assertEquals(s.length(), offsetAtt.endOffset());
            final int startIndex = Character.offsetByCodePoints(s, 0, start);
            final int endIndex = Character.offsetByCodePoints(s, 0, end);
            assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
        }
    }
    assertFalse(tk.incrementToken());
    // finish the TokenStream contract: end(), then close()
    tk.end();
    tk.close();
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), Tokenizer (org.apache.lucene.analysis.Tokenizer), MockTokenizer (org.apache.lucene.analysis.MockTokenizer), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer), WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer), StringReader (java.io.StringReader)
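
To make the nested loop above concrete, here is a hedged sketch that feeds a fixed string through the same filter chain and prints each gram (it assumes the two-argument NGramTokenFilter constructor used in this lucene-solr snapshot). Grams come out grouped by start position, shortest first:

TokenStream tk = new KeywordTokenizer();
((Tokenizer) tk).setReader(new StringReader("abc"));
tk = new NGramTokenFilter(tk, 1, 2);                 // minGram=1, maxGram=2
CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
tk.reset();
while (tk.incrementToken()) {
    System.out.println(termAtt);                     // prints: a, ab, b, bc, c
}
tk.end();
tk.close();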

Example 54 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.

From class TestJapaneseNumberFilter, method analyze.

public void analyze(Analyzer analyzer, Reader reader, Writer writer) throws IOException {
    TokenStream stream = analyzer.tokenStream("dummy", reader);
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        writer.write(termAttr.toString());
        writer.write("\n");
    }
    // complete the TokenStream contract and release the stream before closing the I/O
    stream.end();
    stream.close();
    reader.close();
    writer.close();
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)
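
An illustrative call of the helper above; the WhitespaceAnalyzer is a stand-in (the test itself builds a Japanese analyzer elsewhere in the class), and StringReader/StringWriter stand in for real I/O:

StringWriter out = new StringWriter();
analyze(new WhitespaceAnalyzer(), new StringReader("100 yen"), out);
System.out.print(out);   // one token per line: "100" then "yen"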

Example 55 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.

From class TestJapaneseTokenizer, method testSurrogates2.

/** random test ensuring we don't ever split supplementaries */
public void testSurrogates2() throws IOException {
    int numIterations = atLeast(10000);
    for (int i = 0; i < numIterations; i++) {
        if (VERBOSE) {
            System.out.println("\nTEST: iter=" + i);
        }
        String s = TestUtil.randomUnicodeString(random(), 100);
        try (TokenStream ts = analyzer.tokenStream("foo", s)) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                assertTrue(UnicodeUtil.validUTF16String(termAtt));
            }
            ts.end();
        }
    }
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)
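
In isolation, the assertion above checks that no emitted term contains an unpaired surrogate. A minimal standalone demonstration of UnicodeUtil.validUTF16String from org.apache.lucene.util (the sample strings are illustrative):

String paired = "a\uD834\uDD1Eb";                         // U+1D11E as a correctly paired surrogate
String broken = "a\uD834";                                // high surrogate with no low half
System.out.println(UnicodeUtil.validUTF16String(paired)); // true
System.out.println(UnicodeUtil.validUTF16String(broken)); // false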

Aggregations

CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 151 usages
TokenStream (org.apache.lucene.analysis.TokenStream): 95 usages
StringReader (java.io.StringReader): 46 usages
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 35 usages
PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 34 usages
IOException (java.io.IOException): 27 usages
ArrayList (java.util.ArrayList): 27 usages
Tokenizer (org.apache.lucene.analysis.Tokenizer): 25 usages
Analyzer (org.apache.lucene.analysis.Analyzer): 20 usages
PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 16 usages
BytesRef (org.apache.lucene.util.BytesRef): 15 usages
TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 13 usages
LinkedList (java.util.LinkedList): 11 usages
FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 10 usages
Term (org.apache.lucene.index.Term): 10 usages
HashMap (java.util.HashMap): 9 usages
Token (org.apache.lucene.analysis.Token): 8 usages
Document (org.apache.lucene.document.Document): 8 usages
List (java.util.List): 7 usages
HashSet (java.util.HashSet): 6 usages