Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.
The class SimpleNaiveBayesDocumentClassifier, method getTokenArray.
/**
 * Returns a token array extracted from the given {@link org.apache.lucene.analysis.TokenStream}.
 *
 * @param tokenizedText the tokenized content of a field
 * @return a {@code String} array of the resulting tokens
 * @throws java.io.IOException if tokenization fails because of a low-level I/O error
 */
protected String[] getTokenArray(TokenStream tokenizedText) throws IOException {
  Collection<String> tokens = new LinkedList<>();
  CharTermAttribute charTermAttribute = tokenizedText.addAttribute(CharTermAttribute.class);
  tokenizedText.reset();
  while (tokenizedText.incrementToken()) {
    tokens.add(charTermAttribute.toString());
  }
  tokenizedText.end();
  tokenizedText.close();
  return tokens.toArray(new String[tokens.size()]);
}
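As a usage sketch (not part of the snippet above): the caller supplies an unconsumed stream, typically from Analyzer#tokenStream, and getTokenArray drives the full reset/incrementToken/end/close lifecycle. The StandardAnalyzer and the field name "body" below are illustrative assumptions.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

// Hypothetical caller; "body" is an illustrative field name.
String[] tokensFor(String rawText) throws IOException {
  try (Analyzer analyzer = new StandardAnalyzer()) {
    // tokenStream() hands back an unconsumed stream; getTokenArray()
    // performs the reset/incrementToken/end/close lifecycle itself.
    return getTokenArray(analyzer.tokenStream("body", rawText));
  }
}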
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.
The class BooleanPerceptronClassifier, method assignClass.
/**
 * {@inheritDoc}
 */
@Override
public ClassificationResult<Boolean> assignClass(String text) throws IOException {
  Long output = 0L;
  try (TokenStream tokenStream = analyzer.tokenStream(textFieldName, text)) {
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      String s = charTermAttribute.toString();
      Long d = Util.get(fst, new BytesRef(s));
      if (d != null) {
        output += d;
      }
    }
    tokenStream.end();
  }
  double score = 1 - Math.exp(-1 * Math.abs(bias - output.doubleValue()) / bias);
  return new ClassificationResult<>(output >= bias, score);
}
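The fst field queried above maps terms to weights learned during training. A hedged sketch of how such a term-to-weight FST can be built and looked up with Util.get; the terms and weights are made up, and the Builder class shown is the pre-8.x API (later Lucene renamed it FSTCompiler).

import java.io.IOException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

static FST<Long> buildToyWeights() throws IOException {
  PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
  IntsRefBuilder scratch = new IntsRefBuilder();
  // inputs must be added in sorted order; the weights are illustrative
  builder.add(Util.toIntsRef(new BytesRef("bad"), scratch), 1L);
  builder.add(Util.toIntsRef(new BytesRef("good"), scratch), 9L);
  return builder.finish();
}

// The lookup pattern used in assignClass():
// Util.get(fst, new BytesRef("good")) -> 9L
// Util.get(fst, new BytesRef("ugly")) -> null for unknown terms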
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project jackrabbit-oak by apache.
The class LuceneIndex, method tokenize.
/**
 * Tries to merge back tokens that are split on relevant fulltext query
 * wildcards ('*' or '?').
 *
 * @param text the raw query text
 * @param analyzer the analyzer used to tokenize the text
 * @return the list of tokens, with wildcard characters merged back into
 *         their neighboring tokens, or {@code null} if tokenization fails
 */
static List<String> tokenize(String text, Analyzer analyzer) {
  List<String> tokens = new ArrayList<String>();
  TokenStream stream = null;
  try {
    stream = analyzer.tokenStream(FieldNames.FULLTEXT, new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    // TypeAttribute type = stream.addAttribute(TypeAttribute.class);
    stream.reset();
    int poz = 0;
    boolean hasFulltextToken = false;
    StringBuilder token = new StringBuilder();
    while (stream.incrementToken()) {
      String term = termAtt.toString();
      int start = offsetAtt.startOffset();
      int end = offsetAtt.endOffset();
      if (start > poz) {
        // the analyzer dropped characters between the previous token and
        // this one; keep any fulltext wildcards ('*' or '?') found there
        for (int i = poz; i < start; i++) {
          for (char c : fulltextTokens) {
            if (c == text.charAt(i)) {
              token.append(c);
              hasFulltextToken = true;
            }
          }
        }
      }
      poz = end;
      if (hasFulltextToken) {
        // glue the current term onto the pending wildcard token
        token.append(term);
        hasFulltextToken = false;
      } else {
        // flush the pending token and start a new one
        if (token.length() > 0) {
          tokens.add(token.toString());
        }
        token = new StringBuilder();
        token.append(term);
      }
    }
    // consume any trailing wildcards up to the end of the string
    if (poz < text.length()) {
      for (int i = poz; i < text.length(); i++) {
        for (char c : fulltextTokens) {
          if (c == text.charAt(i)) {
            token.append(c);
          }
        }
      }
    }
    if (token.length() > 0) {
      tokens.add(token.toString());
    }
    stream.end();
  } catch (IOException e) {
    // pass the exception itself so the stack trace is logged
    LOG.error("Building fulltext query failed", e);
    return null;
  } finally {
    try {
      if (stream != null) {
        stream.close();
      }
    } catch (IOException e) {
      // ignore
    }
  }
  return tokens;
}
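A hedged usage sketch: the exact result depends on the analyzer bound to FieldNames.FULLTEXT, but assuming it drops '*' and '?' while recording character offsets, the merge-back behaves roughly like this.

// Assuming an analyzer that splits away '*' and '?' during tokenization:
List<String> tokens = tokenize("hel*o wor?d", analyzer);
// yields ["hel*o", "wor?d"] rather than the split ["hel", "o", "wor", "d"]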
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.
The class TestASCIIFoldingFilter, method testUnmodifiedLetters.
// Test that we do not emit duplicated tokens when preserve original is on
public void testUnmodifiedLetters() throws Exception {
  TokenStream stream = whitespaceMockTokenizer("§ ¦ ¤ END");
  ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream, true);
  CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
  filter.reset();
  assertNextTerms("§", "§", filter, termAtt);
  assertNextTerms("¦", "¦", filter, termAtt);
  assertNextTerms("¤", "¤", filter, termAtt);
  assertNextTerms("END", "END", filter, termAtt);
  assertFalse(filter.incrementToken());
}
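By contrast, characters that do fold are emitted twice when preserveOriginal is true: first the folded form, then the original stacked at the same position. A minimal standalone sketch, assuming a plain WhitespaceTokenizer in place of the test's whitespaceMockTokenizer helper:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class FoldingDemo {
  public static void main(String[] args) throws IOException {
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("café"));
    try (TokenStream stream = new ASCIIFoldingFilter(tokenizer, true)) {
      CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        System.out.println(termAtt); // prints "cafe", then "café"
      }
      stream.end();
    }
  }
}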
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.
The class TestIndexWriter, method testNegativePositions.
// LUCENE-1255
public void testNegativePositions() throws Throwable {
  final TokenStream tokens = new TokenStream() {
    final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    final Iterator<String> terms = Arrays.asList("a", "b", "c").iterator();
    boolean first = true;

    @Override
    public boolean incrementToken() {
      if (!terms.hasNext()) {
        return false;
      }
      clearAttributes();
      termAtt.append(terms.next());
      posIncrAtt.setPositionIncrement(first ? 0 : 1);
      first = false;
      return true;
    }
  };
  Directory dir = newDirectory();
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
  Document doc = new Document();
  doc.add(new TextField("field", tokens));
  expectThrows(IllegalArgumentException.class, () -> {
    w.addDocument(doc);
  });
  w.close();
  dir.close();
}
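IndexWriter rejects this stream because the first token's position increment of 0 would place it at position -1: positions start at -1 and each token lands at the previous position plus its increment, so the first increment must be at least 1. A minimal sketch of a legal variant, flipping the ternary so the first token advances and later tokens stack on it:

// Inside incrementToken(), replacing the setPositionIncrement line above:
posIncrAtt.setPositionIncrement(first ? 1 : 0);
// "a" lands at position 0; "b" and "c" stack there with increment 0,
// which IndexWriter accepts.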