
Example 1 with AttributeFactory

Use of org.apache.lucene.util.AttributeFactory in project lucene-solr by apache.

From the class TestKeywordTokenizer, method testFactory:

public void testFactory() {
    Map<String, String> args = new HashMap<>();
    KeywordTokenizerFactory factory = new KeywordTokenizerFactory(args);
    AttributeFactory attributeFactory = newAttributeFactory();
    Tokenizer tokenizer = factory.create(attributeFactory);
    assertEquals(KeywordTokenizer.class, tokenizer.getClass());
}
Also used: HashMap (java.util.HashMap), AttributeFactory (org.apache.lucene.util.AttributeFactory), Tokenizer (org.apache.lucene.analysis.Tokenizer)
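
For context, KeywordTokenizer emits the entire input as a single token, which is what the factory test above creates. Below is a minimal, hedged sketch (not from the lucene-solr tests; the class name KeywordTokenizerDemo is invented) of how the created tokenizer behaves when actually consumed:

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class KeywordTokenizerDemo {
    public static void main(String[] args) throws Exception {
        // KeywordTokenizer emits the whole input, whitespace included, as one token.
        Tokenizer tokenizer = new KeywordTokenizer();
        tokenizer.setReader(new StringReader("hello keyword world"));
        CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(termAtt); // prints "hello keyword world" exactly once
        }
        tokenizer.end();
        tokenizer.close();
    }
}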

Example 2 with AttributeFactory

Use of org.apache.lucene.util.AttributeFactory in project lucene-solr by apache.

From the class TestUnicodeWhitespaceTokenizer, method testParamsFactory:

public void testParamsFactory() throws IOException {
    // negative maxTokenLen
    IllegalArgumentException iae = expectThrows(IllegalArgumentException.class, () -> new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "-1")));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", iae.getMessage());
    // zero maxTokenLen
    iae = expectThrows(IllegalArgumentException.class, () -> new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "0")));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", iae.getMessage());
    // unknown parameter should throw an IllegalArgumentException
    iae = expectThrows(IllegalArgumentException.class, () -> new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "255", "randomParam", "rValue")));
    assertEquals("Unknown parameters: {randomParam=rValue}", iae.getMessage());
    // tokenizer will split at 5 (Token | izer) regardless of content
    WhitespaceTokenizerFactory factory = new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "5"));
    AttributeFactory attributeFactory = newAttributeFactory();
    Tokenizer tokenizer = factory.create(attributeFactory);
    StringReader reader = new StringReader("Tokenizer šœtest");
    tokenizer.setReader(reader);
    assertTokenStreamContents(tokenizer, new String[] { "Token", "izer", "šœtes", "t" });
    // tokenizer will split at 2 (To | ke | ni | ze | r) regardless of content
    factory = new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "2"));
    attributeFactory = newAttributeFactory();
    tokenizer = factory.create(attributeFactory);
    reader = new StringReader("TokenizerĀ test");
    tokenizer.setReader(reader);
    assertTokenStreamContents(tokenizer, new String[] { "To", "ke", "ni", "ze", "r", "te", "st" });
    // tokenizer would split at 10, but every token here is shorter than that
    factory = new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "10"));
    attributeFactory = newAttributeFactory();
    tokenizer = factory.create(attributeFactory);
    reader = new StringReader("TokenizerĀ test");
    tokenizer.setReader(reader);
    assertTokenStreamContents(tokenizer, new String[] { "Tokenizer", "test" });
}
Also used: StringReader (java.io.StringReader), AttributeFactory (org.apache.lucene.util.AttributeFactory), Tokenizer (org.apache.lucene.analysis.Tokenizer)
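
Two helpers in this test are not shown on this page: newAttributeFactory() comes from Lucene's BaseTokenStreamTestCase (it supplies an AttributeFactory for the test), and makeArgs builds the factory's argument map. A plausible sketch of makeArgs, assuming alternating key/value varargs as the calls above suggest, would be:

// hedged reconstruction of the test helper, not the verbatim lucene-solr code
private Map<String, String> makeArgs(String... args) {
    // assumption: args come in key/value pairs, e.g. makeArgs("rule", "unicode")
    Map<String, String> map = new HashMap<>();
    for (int i = 0; i < args.length; i += 2) {
        map.put(args[i], args[i + 1]);
    }
    return map;
}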

Example 3 with AttributeFactory

Use of org.apache.lucene.util.AttributeFactory in project lucene-solr by apache.

From the class FieldAnalysisRequestHandlerTest, method testCustomAttribute:

//See SOLR-8460
@Test
public void testCustomAttribute() throws Exception {
    FieldAnalysisRequest request = new FieldAnalysisRequest();
    request.addFieldType("skutype1");
    request.setFieldValue("hi, 3456-12 a Test");
    request.setShowMatch(false);
    FieldType fieldType = new TextField();
    Analyzer analyzer = new TokenizerChain(new TokenizerFactory(Collections.emptyMap()) {

        @Override
        public Tokenizer create(AttributeFactory factory) {
            return new CustomTokenizer(factory);
        }
    }, new TokenFilterFactory[] { new TokenFilterFactory(Collections.emptyMap()) {

        @Override
        public TokenStream create(TokenStream input) {
            return new CustomTokenFilter(input);
        }
    } });
    fieldType.setIndexAnalyzer(analyzer);
    NamedList<NamedList> result = handler.analyzeValues(request, fieldType, "fieldNameUnused");
    // just test that the custom flags value shows up in the flags attribute here
    List<NamedList> tokenInfoList = (List<NamedList>) result.findRecursive("index", CustomTokenFilter.class.getName());
    // '1' from CustomTokenFilter plus 900 from CustomFlagsAttributeImpl.
    assertEquals(901, tokenInfoList.get(0).get("org.apache.lucene.analysis.tokenattributes.FlagsAttribute#flags"));
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), TokenizerFactory (org.apache.lucene.analysis.util.TokenizerFactory), NamedList (org.apache.solr.common.util.NamedList), AttributeFactory (org.apache.lucene.util.AttributeFactory), Analyzer (org.apache.lucene.analysis.Analyzer), TokenFilterFactory (org.apache.lucene.analysis.util.TokenFilterFactory), FieldType (org.apache.solr.schema.FieldType), TokenizerChain (org.apache.solr.analysis.TokenizerChain), TextField (org.apache.solr.schema.TextField), ArrayList (java.util.ArrayList), List (java.util.List), FieldAnalysisRequest (org.apache.solr.client.solrj.request.FieldAnalysisRequest), Tokenizer (org.apache.lucene.analysis.Tokenizer), MockTokenizer (org.apache.lucene.analysis.MockTokenizer), WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer), Test (org.junit.Test)
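
The CustomTokenizer, CustomTokenFilter, and CustomFlagsAttributeImpl classes referenced above are defined elsewhere in the test class and not reproduced on this page; the tokenizer presumably installs CustomFlagsAttributeImpl through the AttributeFactory it is given, which is the point of the example. A hedged reconstruction of the flags logic, consistent with the 900 + 1 = 901 arithmetic in the comments (the exact SOLR-8460 code may differ), could look like:

// assumed imports: java.io.IOException, org.apache.lucene.analysis.TokenFilter,
// org.apache.lucene.analysis.tokenattributes.FlagsAttribute and FlagsAttributeImpl
private static class CustomFlagsAttributeImpl extends FlagsAttributeImpl {
    @Override
    public void setFlags(int flags) {
        super.setFlags(900 + flags); // contributes the 900 seen in the assertion
    }
}

private static class CustomTokenFilter extends TokenFilter {
    private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);

    CustomTokenFilter(TokenStream input) {
        super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (input.incrementToken()) {
            flagsAtt.setFlags(1); // the '1' mentioned above; 900 + 1 = 901
            return true;
        }
        return false;
    }
}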

Example 4 with AttributeFactory

Use of org.apache.lucene.util.AttributeFactory in project lucene-solr by apache.

From the class TestUnicodeWhitespaceTokenizer, method testFactory:

public void testFactory() {
    Map<String, String> args = new HashMap<>();
    args.put("rule", "unicode");
    WhitespaceTokenizerFactory factory = new WhitespaceTokenizerFactory(args);
    AttributeFactory attributeFactory = newAttributeFactory();
    Tokenizer tokenizer = factory.create(attributeFactory);
    assertEquals(UnicodeWhitespaceTokenizer.class, tokenizer.getClass());
}
Also used: HashMap (java.util.HashMap), AttributeFactory (org.apache.lucene.util.AttributeFactory), Tokenizer (org.apache.lucene.analysis.Tokenizer)
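
The "rule" argument is what selects UnicodeWhitespaceTokenizer here; the default rule, "java", yields the classic WhitespaceTokenizer. The practical difference is the whitespace definition: Character.isWhitespace does not treat U+00A0 (no-break space) as whitespace, while the Unicode WHITESPACE property does. A small illustrative sketch (the class name WhitespaceRuleDemo is invented):

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.UnicodeWhitespaceTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class WhitespaceRuleDemo {
    public static void main(String[] args) throws Exception {
        String input = "a\u00A0b"; // U+00A0: Unicode whitespace, but not Java whitespace
        printTokens(new WhitespaceTokenizer(), input);        // one token: "a\u00A0b"
        printTokens(new UnicodeWhitespaceTokenizer(), input); // two tokens: "a", "b"
    }

    static void printTokens(Tokenizer tokenizer, String text) throws Exception {
        tokenizer.setReader(new StringReader(text));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term);
        }
        tokenizer.end();
        tokenizer.close();
    }
}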

Example 5 with AttributeFactory

Use of org.apache.lucene.util.AttributeFactory in project lucene-solr by apache.

From the class Analyzer, method normalize:

/**
   * Normalize a string down to the representation that it would have in the
   * index.
   * <p>
   * This is typically used by query parsers in order to generate a query on
   * a given term, without tokenizing or stemming, which are undesirable if
the string to analyze is a partial word (e.g. in case of a wildcard or
   * fuzzy query).
   * <p>
   * This method uses {@link #initReaderForNormalization(String, Reader)} in
   * order to apply necessary character-level normalization and then
   * {@link #normalize(String, TokenStream)} in order to apply the normalizing
   * token filters.
   */
public final BytesRef normalize(final String fieldName, final String text) {
    try {
        // apply char filters
        final String filteredText;
        try (Reader reader = new StringReader(text)) {
            Reader filterReader = initReaderForNormalization(fieldName, reader);
            char[] buffer = new char[64];
            StringBuilder builder = new StringBuilder();
            for (;;) {
                final int read = filterReader.read(buffer, 0, buffer.length);
                if (read == -1) {
                    break;
                }
                builder.append(buffer, 0, read);
            }
            filteredText = builder.toString();
        } catch (IOException e) {
            throw new IllegalStateException("Normalization threw an unexpected exeption", e);
        }
        final AttributeFactory attributeFactory = attributeFactory(fieldName);
        try (TokenStream ts = normalize(fieldName, new StringTokenStream(attributeFactory, filteredText, text.length()))) {
            final TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
            ts.reset();
            if (ts.incrementToken() == false) {
                throw new IllegalStateException("The normalization token stream is " + "expected to produce exactly 1 token, but got 0 for analyzer " + this + " and input \"" + text + "\"");
            }
            final BytesRef term = BytesRef.deepCopyOf(termAtt.getBytesRef());
            if (ts.incrementToken()) {
                throw new IllegalStateException("The normalization token stream is " + "expected to produce exactly 1 token, but got 2+ for analyzer " + this + " and input \"" + text + "\"");
            }
            ts.end();
            return term;
        }
    } catch (IOException e) {
        throw new IllegalStateException("Normalization threw an unexpected exeption", e);
    }
}
Also used: Reader (java.io.Reader), StringReader (java.io.StringReader), AttributeFactory (org.apache.lucene.util.AttributeFactory), IOException (java.io.IOException), TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute), BytesRef (org.apache.lucene.util.BytesRef)
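
As the javadoc says, normalize() is intended for query parsers that need index-time normalization (lowercasing, character folding) applied to a partial term without tokenizing or stemming it. A hedged usage sketch, assuming an analyzer whose normalization chain lowercases (StandardAnalyzer does; the class name NormalizeDemo is invented):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.BytesRef;

public class NormalizeDemo {
    public static void main(String[] args) {
        try (Analyzer analyzer = new StandardAnalyzer()) {
            // Normalize the prefix of a wildcard query such as "WiF*" without
            // tokenizing it; char filters and normalizing token filters still apply.
            BytesRef term = analyzer.normalize("body", "WiF");
            System.out.println(term.utf8ToString()); // prints: wif
        }
    }
}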

Aggregations

AttributeFactory (org.apache.lucene.util.AttributeFactory): 6
Tokenizer (org.apache.lucene.analysis.Tokenizer): 5
StringReader (java.io.StringReader): 3
HashMap (java.util.HashMap): 2
IOException (java.io.IOException): 1
Reader (java.io.Reader): 1
ArrayList (java.util.ArrayList): 1
List (java.util.List): 1
Analyzer (org.apache.lucene.analysis.Analyzer): 1
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 1
TokenStream (org.apache.lucene.analysis.TokenStream): 1
WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer): 1
TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute): 1
TokenFilterFactory (org.apache.lucene.analysis.util.TokenFilterFactory): 1
TokenizerFactory (org.apache.lucene.analysis.util.TokenizerFactory): 1
BytesRef (org.apache.lucene.util.BytesRef): 1
TokenizerChain (org.apache.solr.analysis.TokenizerChain): 1
FieldAnalysisRequest (org.apache.solr.client.solrj.request.FieldAnalysisRequest): 1
NamedList (org.apache.solr.common.util.NamedList): 1
FieldType (org.apache.solr.schema.FieldType): 1