Search in sources :

Example 46 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.

the class SimpleNaiveBayesDocumentClassifier method getTokenArray.

/**
   * Returns a token array from the {@link org.apache.lucene.analysis.TokenStream} in input.
   * <p>
   * The stream is reset, consumed to its end and closed by this method. It is closed even
   * when tokenization fails part-way through, so the caller must not reuse it afterwards.
   *
   * @param tokenizedText the tokenized content of a field
   * @return a {@code String} array of the resulting tokens
   * @throws java.io.IOException If tokenization fails because there is a low-level I/O error
   */
protected String[] getTokenArray(TokenStream tokenizedText) throws IOException {
    Collection<String> tokens = new LinkedList<>();
    // try-with-resources guarantees close() runs even if reset(), incrementToken() or end() throws
    try (TokenStream stream = tokenizedText) {
        CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            // copy the term text out now: the attribute instance is reused for every token
            tokens.add(charTermAttribute.toString());
        }
        stream.end();
    }
    return tokens.toArray(new String[tokens.size()]);
}
Also used : CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) LinkedList(java.util.LinkedList)

Example 47 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.

the class BooleanPerceptronClassifier method assignClass.

/**
   * {@inheritDoc}
   * <p>
   * Analyzes {@code text} with the configured analyzer, sums the values stored in the FST
   * for every token that is found there, and classifies the text as {@code true} when that
   * sum is greater than or equal to the bias.
   */
@Override
public ClassificationResult<Boolean> assignClass(String text) throws IOException {
    // primitive accumulator: avoids unboxing and re-boxing a Long on every matching token
    long output = 0L;
    try (TokenStream tokenStream = analyzer.tokenStream(textFieldName, text)) {
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String s = charTermAttribute.toString();
            Long d = Util.get(fst, new BytesRef(s));
            // a null lookup result means the token is not in the FST; it contributes nothing
            if (d != null) {
                output += d;
            }
        }
        tokenStream.end();
    }
    // score grows towards 1 as the summed output moves away from the bias (relative to the bias)
    double score = 1 - Math.exp(-1 * Math.abs(bias - (double) output) / bias);
    return new ClassificationResult<>(output >= bias, score);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) BytesRef(org.apache.lucene.util.BytesRef)

Example 48 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project jackrabbit-oak by apache.

the class LuceneIndex method tokenize.

/**
     * Tokenizes {@code text} with the given analyzer and tries to merge back tokens that
     * were split on relevant fulltext query wildcards ('*' or '?').
     * <p>
     * Characters from {@code fulltextTokens} found in the gap before an analyzed term are
     * re-attached to the token being built, so the term that follows is appended to the same
     * token instead of starting a new one. Matching characters left after the last term are
     * appended to the final token.
     *
     * @param text the fulltext query text to tokenize
     * @param analyzer the analyzer used to produce the token stream
     * @return the list of (possibly merged) tokens, or {@code null} if reading the token
     *         stream failed with an {@link IOException}
     */
static List<String> tokenize(String text, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(FieldNames.FULLTEXT, new StringReader(text));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        stream.reset();
        // end offset of the previous term, i.e. where the not-yet-inspected text starts
        int poz = 0;
        // true when a wildcard was found right before the current term
        boolean hasFulltextToken = false;
        StringBuilder token = new StringBuilder();
        while (stream.incrementToken()) {
            String term = termAtt.toString();
            int start = offsetAtt.startOffset();
            int end = offsetAtt.endOffset();
            // scan the characters the analyzer skipped between the previous term and this one
            if (start > poz) {
                for (int i = poz; i < start; i++) {
                    for (char c : fulltextTokens) {
                        if (c == text.charAt(i)) {
                            token.append(c);
                            hasFulltextToken = true;
                        }
                    }
                }
            }
            poz = end;
            if (hasFulltextToken) {
                // a wildcard glues this term onto the token under construction
                token.append(term);
                hasFulltextToken = false;
            } else {
                // no wildcard in between: flush the previous token and start a new one
                if (token.length() > 0) {
                    tokens.add(token.toString());
                }
                token = new StringBuilder();
                token.append(term);
            }
        }
        // consume to the end of the string
        if (poz < text.length()) {
            for (int i = poz; i < text.length(); i++) {
                for (char c : fulltextTokens) {
                    if (c == text.charAt(i)) {
                        token.append(c);
                    }
                }
            }
        }
        if (token.length() > 0) {
            tokens.add(token.toString());
        }
        stream.end();
    } catch (IOException e) {
        // pass the exception itself so the logger records its message and stack trace;
        // a bare e.getMessage() argument has no placeholder to bind to and is discarded
        LOG.error("Building fulltext query failed", e);
        return null;
    } finally {
        try {
            if (stream != null) {
                stream.close();
            }
        } catch (IOException ignored) {
            // best-effort close: a failure here must not mask the result or the original error
        }
    }
    return tokens;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) ArrayList(java.util.ArrayList) StringReader(java.io.StringReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) IOException(java.io.IOException)

Example 49 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.

the class TestASCIIFoldingFilter method testUnmodifiedLetters.

// Checks that duplicated tokens are not emitted when preserve original is on:
// every input term is asserted with the same value for both expected arguments.
public void testUnmodifiedLetters() throws Exception {
    final String[] expectedTerms = { "§", "¦", "¤", "END" };
    TokenStream source = whitespaceMockTokenizer("§ ¦ ¤ END");
    ASCIIFoldingFilter foldingFilter = new ASCIIFoldingFilter(source, true);
    CharTermAttribute term = foldingFilter.getAttribute(CharTermAttribute.class);
    foldingFilter.reset();
    for (String expected : expectedTerms) {
        assertNextTerms(expected, expected, foldingFilter, term);
    }
    // the stream must be exhausted after the four expected terms
    assertFalse(foldingFilter.incrementToken());
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute)

Example 50 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.

the class TestIndexWriter method testNegativePositions.

// LUCENE-1255
// Indexing a token stream whose first token carries a position increment of 0 is expected
// to be rejected with an IllegalArgumentException.
public void testNegativePositions() throws Throwable {
    final TokenStream tokens = new TokenStream() {

        final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

        final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

        final Iterator<String> terms = Arrays.asList("a", "b", "c").iterator();

        // only the very first token gets a position increment of 0
        boolean first = true;

        @Override
        public boolean incrementToken() {
            if (!terms.hasNext())
                return false;
            clearAttributes();
            termAtt.append(terms.next());
            posIncrAtt.setPositionIncrement(first ? 0 : 1);
            first = false;
            return true;
        }
    };
    // try-with-resources releases the writer and the directory even when the assertion below
    // fails; resources close in reverse order, so the writer is still closed before the directory
    try (Directory dir = newDirectory();
        IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())))) {
        Document doc = new Document();
        doc.add(new TextField("field", tokens));
        expectThrows(IllegalArgumentException.class, () -> {
            w.addDocument(doc);
        });
    }
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) DocIdSetIterator(org.apache.lucene.search.DocIdSetIterator) Iterator(java.util.Iterator) TextField(org.apache.lucene.document.TextField) Document(org.apache.lucene.document.Document) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) MMapDirectory(org.apache.lucene.store.MMapDirectory) Directory(org.apache.lucene.store.Directory) RAMDirectory(org.apache.lucene.store.RAMDirectory) FSDirectory(org.apache.lucene.store.FSDirectory) SimpleFSDirectory(org.apache.lucene.store.SimpleFSDirectory) NIOFSDirectory(org.apache.lucene.store.NIOFSDirectory)

Aggregations

CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute) 151
TokenStream (org.apache.lucene.analysis.TokenStream) 95
StringReader (java.io.StringReader) 46
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute) 35
PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) 34
IOException (java.io.IOException) 27
ArrayList (java.util.ArrayList) 27
Tokenizer (org.apache.lucene.analysis.Tokenizer) 25
Analyzer (org.apache.lucene.analysis.Analyzer) 20
PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute) 16
BytesRef (org.apache.lucene.util.BytesRef) 15
TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute) 13
LinkedList (java.util.LinkedList) 11
FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute) 10
Term (org.apache.lucene.index.Term) 10
HashMap (java.util.HashMap) 9
Token (org.apache.lucene.analysis.Token) 8
Document (org.apache.lucene.document.Document) 8
List (java.util.List) 7
HashSet (java.util.HashSet) 6