Search in sources :

Example 16 with CannedTokenStream

use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.

the class TestTypeTokenFilterFactory method testCreationWithBlackList.

/**
 * Smoke test: builds a "Type" filter factory from two stop-type files, without
 * the {@code useWhitelist} argument, and applies it to a
 * {@link CannedTokenStream} constructed with no tokens. There are no
 * assertions; the test passes as long as nothing throws.
 */
public void testCreationWithBlackList() throws Exception {
    final TokenFilterFactory typeFactory =
        tokenFilterFactory("Type", "types", "stoptypes-1.txt, stoptypes-2.txt");
    typeFactory.create(new CannedTokenStream());
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenFilterFactory(org.apache.lucene.analysis.util.TokenFilterFactory)

Example 17 with CannedTokenStream

use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.

the class TestTypeTokenFilterFactory method testCreationWithWhiteList.

/**
 * Smoke test: builds a "Type" filter factory from two stop-type files with
 * {@code useWhitelist} set to {@code "true"}, and applies it to a
 * {@link CannedTokenStream} constructed with no tokens. There are no
 * assertions; the test passes as long as nothing throws.
 */
public void testCreationWithWhiteList() throws Exception {
    final TokenFilterFactory typeFactory =
        tokenFilterFactory("Type", "types", "stoptypes-1.txt, stoptypes-2.txt", "useWhitelist", "true");
    typeFactory.create(new CannedTokenStream());
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenFilterFactory(org.apache.lucene.analysis.util.TokenFilterFactory)

Example 18 with CannedTokenStream

use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.

the class ShingleFilterTest method testTwoTrailingHolesTriShingleWithTokenFiller.

/**
 * Checks the shingles (sizes 2 to 3) produced for a two-token stream under four
 * filler/separator configurations: filler {@code "--"}, empty filler,
 * {@code null} filler, and {@code null} filler combined with a {@code null}
 * token separator.
 *
 * @throws IOException if consuming any of the token streams fails
 */
public void testTwoTrailingHolesTriShingleWithTokenFiller() throws IOException {
    // Models the analysis of "purple wizard of the": "of" and "the" were removed
    // as stopwords, which leaves two trailing holes after "wizard".
    final Token[] inputTokens = { createToken("purple", 0, 6), createToken("wizard", 7, 13) };

    // Scenario 1: the holes are rendered with an explicit "--" filler.
    final ShingleFilter dashFiller = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
    dashFiller.setFillerToken("--");
    assertTokenStreamContents(
        dashFiller,
        new String[] { "purple", "purple wizard", "purple wizard --", "wizard", "wizard --", "wizard -- --" },
        new int[] { 0, 0, 0, 7, 7, 7 },
        new int[] { 6, 13, 20, 13, 20, 20 },
        new int[] { 1, 0, 0, 1, 0, 0 },
        20);

    // Scenario 2: empty filler, so only the separators remain visible.
    final ShingleFilter emptyFiller = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
    emptyFiller.setFillerToken("");
    assertTokenStreamContents(
        emptyFiller,
        new String[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  " },
        new int[] { 0, 0, 0, 7, 7, 7 },
        new int[] { 6, 13, 20, 13, 20, 20 },
        new int[] { 1, 0, 0, 1, 0, 0 },
        20);

    // Scenario 3: null filler; the expected output is the same as for the empty filler.
    final ShingleFilter nullFiller = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
    nullFiller.setFillerToken(null);
    assertTokenStreamContents(
        nullFiller,
        new String[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  " },
        new int[] { 0, 0, 0, 7, 7, 7 },
        new int[] { 6, 13, 20, 13, 20, 20 },
        new int[] { 1, 0, 0, 1, 0, 0 },
        20);

    // Scenario 4: null filler and null separator; tokens are joined with nothing between them.
    final ShingleFilter nullFillerNoSeparator = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
    nullFillerNoSeparator.setFillerToken(null);
    nullFillerNoSeparator.setTokenSeparator(null);
    assertTokenStreamContents(
        nullFillerNoSeparator,
        new String[] { "purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard" },
        new int[] { 0, 0, 0, 7, 7, 7 },
        new int[] { 6, 13, 20, 13, 20, 20 },
        new int[] { 1, 0, 0, 1, 0, 0 },
        20);
}
Also used : Token(org.apache.lucene.analysis.Token) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream)

Example 19 with CannedTokenStream

use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.

the class ShingleFilterTest method shingleFilterTest.

/**
 * Wraps {@code tokensToShingle} in a {@link ShingleFilter} configured with the
 * given sizes, separator and unigram flag, then delegates the comparison to
 * {@code shingleFilterTestCommon}.
 *
 * @param tokenSeparator     separator handed to {@code setTokenSeparator}
 * @param minSize            minimum shingle size passed to the filter constructor
 * @param maxSize            maximum shingle size passed to the filter constructor
 * @param tokensToShingle    input tokens fed to the filter
 * @param tokensToCompare    expected tokens, forwarded to the common check
 * @param positionIncrements expected position increments, forwarded to the common check
 * @param types              expected token types, forwarded to the common check
 * @param outputUnigrams     value handed to {@code setOutputUnigrams}
 * @throws IOException if the underlying check fails while reading the stream
 */
protected void shingleFilterTest(String tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle, Token[] tokensToCompare, int[] positionIncrements, String[] types, boolean outputUnigrams) throws IOException {
    final CannedTokenStream source = new CannedTokenStream(tokensToShingle);
    final ShingleFilter shingles = new ShingleFilter(source, minSize, maxSize);
    shingles.setTokenSeparator(tokenSeparator);
    shingles.setOutputUnigrams(outputUnigrams);
    shingleFilterTestCommon(shingles, tokensToCompare, positionIncrements, types);
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream)

Example 20 with CannedTokenStream

use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.

the class TestPostingsOffsets method checkTokens.

// TODO: more tests with other possibilities
/**
 * Indexes a single document carrying two "body" field instances, one per token
 * array, using a field type that indexes offsets and stores term vectors with
 * positions and offsets.
 *
 * @param field1 tokens for the first "body" field instance
 * @param field2 tokens for the second "body" field instance
 * @throws IOException if indexing or closing the writer/directory fails
 */
private void checkTokens(Token[] field1, Token[] field2) throws IOException {
    final Directory directory = newDirectory();
    final RandomIndexWriter writer = new RandomIndexWriter(random(), directory, iwc);
    boolean finishedCleanly = false;
    try {
        final FieldType offsetsType = new FieldType(TextField.TYPE_NOT_STORED);
        offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        // Term vectors are stored as well so the checkindex cross-check has data to compare.
        offsetsType.setStoreTermVectors(true);
        offsetsType.setStoreTermVectorPositions(true);
        offsetsType.setStoreTermVectorOffsets(true);

        final Document document = new Document();
        document.add(new Field("body", new CannedTokenStream(field1), offsetsType));
        document.add(new Field("body", new CannedTokenStream(field2), offsetsType));
        writer.addDocument(document);

        // The writer is closed inside the try so a failure here takes the error path below.
        writer.close();
        finishedCleanly = true;
    } finally {
        if (!finishedCleanly) {
            // Error path: close both resources through the exception-handling variant.
            IOUtils.closeWhileHandlingException(writer, directory);
        } else {
            // Success path: the writer is already closed, only the directory remains.
            IOUtils.close(directory);
        }
    }
}
Also used : StringField(org.apache.lucene.document.StringField) NumericDocValuesField(org.apache.lucene.document.NumericDocValuesField) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) Document(org.apache.lucene.document.Document) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) Directory(org.apache.lucene.store.Directory) FieldType(org.apache.lucene.document.FieldType)

Aggregations

CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream)78 TokenStream (org.apache.lucene.analysis.TokenStream)43 Token (org.apache.lucene.analysis.Token)37 Directory (org.apache.lucene.store.Directory)33 Document (org.apache.lucene.document.Document)26 TextField (org.apache.lucene.document.TextField)22 Field (org.apache.lucene.document.Field)15 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)14 BytesRef (org.apache.lucene.util.BytesRef)14 FieldType (org.apache.lucene.document.FieldType)13 Term (org.apache.lucene.index.Term)13 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)11 Tokenizer (org.apache.lucene.analysis.Tokenizer)11 IndexReader (org.apache.lucene.index.IndexReader)10 Analyzer (org.apache.lucene.analysis.Analyzer)9 StringField (org.apache.lucene.document.StringField)8 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)8 Reader (java.io.Reader)7 StringReader (java.io.StringReader)7 Input (org.apache.lucene.search.suggest.Input)7