use of org.apache.lucene.analysis.icu.segmentation.ICUTokenizer in project elasticsearch by elastic.
the class IcuTokenizerFactoryTests method testIcuCustomizeRuleFile.
public void testIcuCustomizeRuleFile() throws IOException {
TestAnalysis analysis = createTestAnalysis();
// test the tokenizer with a single rule file
TokenizerFactory tokenizerFactory = analysis.tokenizer.get("user_rule_tokenizer");
ICUTokenizer tokenizer = (ICUTokenizer) tokenizerFactory.create();
Reader reader = new StringReader("One-two punch. Brang-, not brung-it. This one--not that one--is the right one, -ish.");
tokenizer.setReader(reader);
assertTokenStreamContents(tokenizer, new String[] { "One-two", "punch", "Brang", "not", "brung-it", "This", "one", "not", "that", "one", "is", "the", "right", "one", "ish" });
}
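The "user_rule_tokenizer" referenced above has to be registered in the test's analysis settings before createTestAnalysis() can resolve it. A minimal sketch of such a registration, assuming the icu_tokenizer's rule_files parameter and an RBBI rule file named KeywordTokenizer.rbbi for Latin script (the concrete file name and setting keys are illustrative, not taken from the snippet above):

import org.elasticsearch.common.settings.Settings;

// Illustrative only: declare an icu_tokenizer that loads a custom break-rule file.
Settings settings = Settings.builder()
    .put("index.analysis.tokenizer.user_rule_tokenizer.type", "icu_tokenizer")
    // rule_files takes comma-separated "script:rulefile" pairs; the .rbbi file
    // must be available in the config directory.
    .put("index.analysis.tokenizer.user_rule_tokenizer.rule_files", "Latn:KeywordTokenizer.rbbi")
    .build();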
use of org.apache.lucene.analysis.icu.segmentation.ICUTokenizer in project elasticsearch by elastic.
the class IcuTokenizerFactoryTests method testSimpleIcuTokenizer.
public void testSimpleIcuTokenizer() throws IOException {
TestAnalysis analysis = createTestAnalysis();
TokenizerFactory tokenizerFactory = analysis.tokenizer.get("icu_tokenizer");
ICUTokenizer tokenizer = (ICUTokenizer) tokenizerFactory.create();
Reader reader = new StringReader("向日葵, one-two");
tokenizer.setReader(reader);
assertTokenStreamContents(tokenizer, new String[] { "向日葵", "one", "two" });
}
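Outside of the Elasticsearch test harness, the same tokenizer can be driven directly through the Lucene TokenStream API. A small self-contained sketch (using the no-argument ICUTokenizer constructor, i.e. the default segmentation rules rather than the "icu_tokenizer" factory above) that prints every emitted term:

import java.io.StringReader;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IcuTokenizerDemo {
    public static void main(String[] args) throws Exception {
        // Default ICU segmentation rules, no custom .rbbi files.
        try (ICUTokenizer tokenizer = new ICUTokenizer()) {
            tokenizer.setReader(new StringReader("向日葵, one-two"));
            CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
            tokenizer.reset();                       // required before incrementToken()
            while (tokenizer.incrementToken()) {
                System.out.println(term.toString()); // expected: 向日葵, one, two (as in the test above)
            }
            tokenizer.end();
        }
    }
}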
use of org.apache.lucene.analysis.icu.segmentation.ICUTokenizer in project elasticsearch by elastic.
the class IcuTokenizerFactoryTests method testMultipleIcuCustomizeRuleFiles.
public void testMultipleIcuCustomizeRuleFiles() throws IOException {
TestAnalysis analysis = createTestAnalysis();
// test the tokenizer with two rule files
TokenizerFactory tokenizerFactory = analysis.tokenizer.get("multi_rule_tokenizer");
ICUTokenizer tokenizer = (ICUTokenizer) tokenizerFactory.create();
StringReader reader = new StringReader("Some English. Немного русский. ข้อความภาษาไทยเล็ก ๆ น้อย ๆ More English.");
tokenizer.setReader(reader);
assertTokenStreamContents(tokenizer, new String[] { "Some", "English", "Немного русский. ", "ข้อความภาษาไทยเล็ก ๆ น้อย ๆ ", "More", "English" });
}
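For the two-rule-file case, the "multi_rule_tokenizer" would presumably list one script:file pair per rule file, comma separated, in the same rule_files setting. A hedged sketch to illustrate the shape of that value (file names are again placeholders):

import org.elasticsearch.common.settings.Settings;

// Illustrative only: one custom rule file per script, comma separated.
Settings settings = Settings.builder()
    .put("index.analysis.tokenizer.multi_rule_tokenizer.type", "icu_tokenizer")
    .put("index.analysis.tokenizer.multi_rule_tokenizer.rule_files",
         "Cyrl:KeywordTokenizer.rbbi,Thai:KeywordTokenizer.rbbi")
    .build();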
use of org.apache.lucene.analysis.icu.segmentation.ICUTokenizer in project stanbol by apache.
the class QueryUtils method parseWildcardQueryTerms.
/**
* Parses query terms for Wildcard queries as described in the first
* comment of STANBOL-607. <p>
* As an example the String:
* <code><pre>
* "This is a te?t for multi* Toke? Wildc\*adrd Se?rche*
* </pre></code>
* is converted into the query terms
* <code><pre>
* ["This is a","te?t","multi*","toke?","Wildc\*adrd","se?rche*"]
* </pre></code>
* NOTE: tokens that include a wildcard are converted to lower case
* @param value the value
* @param loewercaseWildcardTokens if query elements that include a wildcard
* should be converted to lower case.
* @return the query terms
*/
private static QueryTerm[] parseWildcardQueryTerms(String value, boolean loewercaseWildcardTokens) {
//This assumes that the Tokenizer does tokenize '*' and '?',
//which makes it a little bit tricky.
Tokenizer tokenizer = new ICUTokenizer(new StringReader(value), tokenizerConfig);
Matcher m = WILDCARD_QUERY_CHAR_PATTERN.matcher(value);
int next = m.find() ? m.start() + 1 : -1;
if (next < 0) {
//No wildcard
return new QueryTerm[] { new QueryTerm(value, false, true, true) };
}
ArrayList<QueryTerm> queryElements = new ArrayList<QueryTerm>(5);
int lastAdded = -1;
int lastOffset = 0;
boolean foundWildcard = false;
//Lucene tokenizers are really low level ...
try {
//starting with Solr 4, reset() MUST be called before using the tokenizer
tokenizer.reset();
while (tokenizer.incrementToken()) {
//only interested in the start/end indexes of tokens
OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
if (lastAdded < 0) {
//start a new query element with this token
lastAdded = offset.startOffset();
}
if (foundWildcard) {
//the previous token ended with a wildcard; decide how to close that query term
if (offset.startOffset() > lastOffset + 1) {
//(1) there is a gap before the current token: emit the pending wildcard term and start a new element at this token
String queryElement = value.substring(lastAdded, lastOffset + 1);
if (loewercaseWildcardTokens) {
queryElement = queryElement.toLowerCase();
}
queryElements.add(new QueryTerm(queryElement, true, false, true));
//previous token consumed
lastAdded = offset.startOffset();
//set to the start of the current token
foundWildcard = false;
} else if (next != offset.endOffset()) {
//(2) the current token does not itself end with a wildcard: append it to the pending term and emit it
String queryElement = value.substring(lastAdded, offset.endOffset());
if (loewercaseWildcardTokens) {
queryElement = queryElement.toLowerCase();
}
queryElements.add(new QueryTerm(queryElement, true, false, true));
//consume the current token
lastAdded = -1;
foundWildcard = false;
}
}
if (next == offset.endOffset()) {
//end of current token is '*' or '?'
//search next '*', '?' in value
next = m.find() ? m.start() + 1 : -1;
//the tokens collected before the wildcard form a plain (non-wildcard) term: emit it first
if (!foundWildcard && lastAdded < lastOffset) {
String queryElement = value.substring(lastAdded, lastOffset);
queryElements.add(new QueryTerm(queryElement, false, true, true));
lastAdded = offset.startOffset();
}
//else multiple wildcards in a single token
foundWildcard = true;
}
lastOffset = offset.endOffset();
}
} catch (IOException e) {
//a StringReader cannot throw IOExceptions
throw new IllegalStateException(e);
}
if (lastAdded >= 0 && lastAdded < value.length()) {
String queryElement = value.substring(lastAdded, value.length());
if (foundWildcard && loewercaseWildcardTokens) {
queryElement = queryElement.toLowerCase();
}
if (foundWildcard) {
queryElements.add(new QueryTerm(queryElement, true, false, true));
} else {
queryElements.add(new QueryTerm(queryElement, false, true, true));
}
}
return queryElements.toArray(new QueryTerm[queryElements.size()]);
}
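parseWildcardQueryTerms() leans entirely on the OffsetAttribute of the ICUTokenizer to map tokens back to character positions in the raw query string and to rebuild substrings around the wildcard characters. The pattern in isolation, as a minimal sketch (the Stanbol-specific tokenizerConfig and QueryTerm class are left out; the default ICUTokenizer constructor is used instead):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class OffsetWalkDemo {
    public static void main(String[] args) throws IOException {
        String value = "te?t for multi* token";
        try (Tokenizer tokenizer = new ICUTokenizer()) {   // Stanbol passes its own tokenizerConfig here
            tokenizer.setReader(new StringReader(value));
            CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
            tokenizer.reset();                              // reset() is mandatory before incrementToken()
            while (tokenizer.incrementToken()) {
                // startOffset()/endOffset() index into the original string, so
                // substrings of 'value' can be stitched back together around wildcards.
                System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
            }
            tokenizer.end();
        }
    }
}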