use of org.apache.lucene.analysis.Token in project elasticsearch by elastic.
the class FlattenGraphTokenFilterFactoryTests method token.
private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
    final Token t = new Token(term, startOffset, endOffset);
    t.setPositionIncrement(posInc);
    t.setPositionLength(posLength);
    return t;
}
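A helper like this is typically handed to a CannedTokenStream (from Lucene's test framework) to build a synthetic token graph for the flatten-graph filter to consume. A minimal sketch of that pattern follows; the token values and offsets are illustrative, not the project's actual test fixture:

// Illustrative graph: "wtf" is a side path spanning the same three
// positions (and the same character span) as "what the fudge".
TokenStream graph = new CannedTokenStream(
    token("wtf", 1, 3, 0, 3),
    token("what", 0, 1, 0, 3),
    token("the", 1, 1, 0, 3),
    token("fudge", 1, 1, 0, 3));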
use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
the class SpellCheckComponent method getTokens.
private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
    Collection<Token> result = new ArrayList<>();
    assert analyzer != null;
    try (TokenStream ts = analyzer.tokenStream("", q)) {
        ts.reset();
        // TODO: support custom attributes
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
        FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
        PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        while (ts.incrementToken()) {
            Token token = new Token();
            token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
            token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            token.setType(typeAtt.type());
            token.setFlags(flagsAtt.getFlags());
            token.setPayload(payloadAtt.getPayload());
            token.setPositionIncrement(posIncAtt.getPositionIncrement());
            result.add(token);
        }
        ts.end();
        return result;
    }
}
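The same consume-and-copy pattern can be reproduced standalone. The sketch below uses a WhitespaceAnalyzer purely for illustration (SpellCheckComponent uses whatever query analyzer the configured spellchecker supplies) and copies only the term and offset attributes:

// Sketch: tokenize a string the way getTokens does, under the
// assumption of a simple whitespace analyzer.
Analyzer analyzer = new WhitespaceAnalyzer();
try (TokenStream ts = analyzer.tokenStream("", "hopefully correctt")) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        Token token = new Token(termAtt.toString(), offsetAtt.startOffset(), offsetAtt.endOffset());
        System.out.println(token + " @ " + token.startOffset() + "-" + token.endOffset());
    }
    ts.end();
}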
use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
the class SpellCheckComponent method process.
@Override
@SuppressWarnings("unchecked")
public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(COMPONENT_NAME, false) || spellCheckers.isEmpty()) {
        return;
    }
    boolean shardRequest = "true".equals(params.get(ShardParams.IS_SHARD));
    String q = params.get(SPELLCHECK_Q);
    SolrSpellChecker spellChecker = getSpellChecker(params);
    Collection<Token> tokens = null;
    if (q != null) {
        // we have a spellcheck param, tokenize it with the query analyzer applicable for this spellchecker
        tokens = getTokens(q, spellChecker.getQueryAnalyzer());
    } else {
        q = rb.getQueryString();
        if (q == null) {
            q = params.get(CommonParams.Q);
        }
        tokens = queryConverter.convert(q);
    }
    if (tokens != null && tokens.isEmpty() == false) {
        if (spellChecker != null) {
            int count = params.getInt(SPELLCHECK_COUNT, 1);
            boolean onlyMorePopular = params.getBool(SPELLCHECK_ONLY_MORE_POPULAR, DEFAULT_ONLY_MORE_POPULAR);
            boolean extendedResults = params.getBool(SPELLCHECK_EXTENDED_RESULTS, false);
            boolean collate = params.getBool(SPELLCHECK_COLLATE, false);
            float accuracy = params.getFloat(SPELLCHECK_ACCURACY, Float.MIN_VALUE);
            int alternativeTermCount = params.getInt(SpellingParams.SPELLCHECK_ALTERNATIVE_TERM_COUNT, 0);
            // If specified, this can be a discrete number of results, or a percentage of fq results.
            Integer maxResultsForSuggest = maxResultsForSuggest(rb);
            ModifiableSolrParams customParams = new ModifiableSolrParams();
            for (String checkerName : getDictionaryNames(params)) {
                customParams.add(getCustomParams(checkerName, params));
            }
            Integer hitsInteger = (Integer) rb.rsp.getToLog().get("hits");
            long hits = 0;
            if (hitsInteger == null) {
                hits = rb.getNumberDocumentsFound();
            } else {
                hits = hitsInteger.longValue();
            }
            SpellingResult spellingResult = null;
            if (maxResultsForSuggest == null || hits <= maxResultsForSuggest) {
                SuggestMode suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
                if (onlyMorePopular) {
                    suggestMode = SuggestMode.SUGGEST_MORE_POPULAR;
                } else if (alternativeTermCount > 0) {
                    suggestMode = SuggestMode.SUGGEST_ALWAYS;
                }
                IndexReader reader = rb.req.getSearcher().getIndexReader();
                SpellingOptions options = new SpellingOptions(tokens, reader, count, alternativeTermCount, suggestMode, extendedResults, accuracy, customParams);
                spellingResult = spellChecker.getSuggestions(options);
            } else {
                spellingResult = new SpellingResult();
            }
            boolean isCorrectlySpelled = hits > (maxResultsForSuggest == null ? 0 : maxResultsForSuggest);
            NamedList response = new SimpleOrderedMap();
            NamedList suggestions = toNamedList(shardRequest, spellingResult, q, extendedResults);
            response.add("suggestions", suggestions);
            if (extendedResults) {
                response.add("correctlySpelled", isCorrectlySpelled);
            }
            if (collate) {
                addCollationsToResponse(params, spellingResult, rb, q, response, spellChecker.isSuggestionsMayOverlap());
            }
            if (shardRequest) {
                addOriginalTermsToResponse(response, tokens);
            }
            rb.rsp.add("spellcheck", response);
        } else {
            throw new SolrException(SolrException.ErrorCode.NOT_FOUND, "Specified dictionaries do not exist: " + getDictionaryNameAsSingleString(getDictionaryNames(params)));
        }
    }
}
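From the client side, this code path is driven entirely by request parameters. A hedged SolrJ sketch of a request that exercises it (the collection name, the misspelled query, and the pre-built SolrClient are assumptions for illustration):

// Sketch: parameters read by SpellCheckComponent.process.
// "client" is assumed to be an already-constructed SolrClient.
SolrQuery query = new SolrQuery("delll ultra sharp");
query.set("spellcheck", "true");                 // the COMPONENT_NAME gate
query.set("spellcheck.q", "delll ultra sharp");  // SPELLCHECK_Q: analyzed with the checker's query analyzer
query.set("spellcheck.count", "5");              // SPELLCHECK_COUNT
query.set("spellcheck.collate", "true");         // SPELLCHECK_COLLATE
QueryResponse rsp = client.query("techproducts", query);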
use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
the class SpellCheckComponent method toNamedList.
protected NamedList toNamedList(boolean shardRequest, SpellingResult spellingResult, String origQuery, boolean extendedResults) {
    NamedList result = new NamedList();
    Map<Token, LinkedHashMap<String, Integer>> suggestions = spellingResult.getSuggestions();
    boolean hasFreqInfo = spellingResult.hasTokenFrequencyInfo();
    boolean hasSuggestions = false;
    boolean hasZeroFrequencyToken = false;
    for (Map.Entry<Token, LinkedHashMap<String, Integer>> entry : suggestions.entrySet()) {
        Token inputToken = entry.getKey();
        String tokenString = new String(inputToken.buffer(), 0, inputToken.length());
        Map<String, Integer> theSuggestions = new LinkedHashMap<>(entry.getValue());
        // drop any suggestion that is identical to the input token
        Iterator<String> sugIter = theSuggestions.keySet().iterator();
        while (sugIter.hasNext()) {
            String sug = sugIter.next();
            if (sug.equals(tokenString)) {
                sugIter.remove();
            }
        }
        if (theSuggestions.size() > 0) {
            hasSuggestions = true;
        }
        // theSuggestions is never null here (it was created just above), so only its size matters
        if (theSuggestions.size() > 0 || shardRequest) {
            SimpleOrderedMap suggestionList = new SimpleOrderedMap();
            suggestionList.add("numFound", theSuggestions.size());
            suggestionList.add("startOffset", inputToken.startOffset());
            suggestionList.add("endOffset", inputToken.endOffset());
            if (extendedResults && hasFreqInfo) {
                suggestionList.add("origFreq", spellingResult.getTokenFrequency(inputToken));
                ArrayList<SimpleOrderedMap> sugs = new ArrayList<>();
                suggestionList.add("suggestion", sugs);
                for (Map.Entry<String, Integer> suggEntry : theSuggestions.entrySet()) {
                    SimpleOrderedMap sugEntry = new SimpleOrderedMap();
                    sugEntry.add("word", suggEntry.getKey());
                    sugEntry.add("freq", suggEntry.getValue());
                    sugs.add(sugEntry);
                }
            } else {
                suggestionList.add("suggestion", theSuggestions.keySet());
            }
            if (hasFreqInfo) {
                Integer tokenFrequency = spellingResult.getTokenFrequency(inputToken);
                if (tokenFrequency == null || tokenFrequency == 0) {
                    hasZeroFrequencyToken = true;
                }
            }
            result.add(tokenString, suggestionList);
        }
    }
    return result;
}
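Rendered as JSON, the NamedList built here takes roughly the shape below for a misspelled term like "delll" with spellcheck.extendedResults=true; the frequencies are illustrative values, not real output:

"spellcheck": {
    "suggestions": [
        "delll", {
            "numFound": 1,
            "startOffset": 0,
            "endOffset": 5,
            "origFreq": 0,
            "suggestion": [ { "word": "dell", "freq": 14 } ]
        }
    ]
}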
use of org.apache.lucene.analysis.Token in project lucene-solr by apache.
the class TokenSourcesTest method testPayloads.
// LUCENE-5294
public void testPayloads() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
    myFieldType.setStoreTermVectors(true);
    myFieldType.setStoreTermVectorOffsets(true);
    myFieldType.setStoreTermVectorPositions(true);
    myFieldType.setStoreTermVectorPayloads(true);
    curOffset = 0;
    Token[] tokens = new Token[] { getToken("foxes"), getToken("can"), getToken("jump"), getToken("high") };
    Document doc = new Document();
    doc.add(new Field("field", new CannedTokenStream(tokens), myFieldType));
    writer.addDocument(doc);
    IndexReader reader = writer.getReader();
    writer.close();
    assertEquals(1, reader.numDocs());
    TokenStream ts = TokenSources.getTermVectorTokenStreamOrNull("field", reader.getTermVectors(0), -1);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
    PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    ts.reset();
    for (Token token : tokens) {
        assertTrue(ts.incrementToken());
        assertEquals(token.toString(), termAtt.toString());
        assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
        assertEquals(token.getPayload(), payloadAtt.getPayload());
        assertEquals(token.startOffset(), offsetAtt.startOffset());
        assertEquals(token.endOffset(), offsetAtt.endOffset());
    }
    assertFalse(ts.incrementToken());
    reader.close();
    dir.close();
}
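The test depends on a curOffset field and a getToken helper that fall outside this excerpt. A plausible reconstruction is sketched below (the actual helper in TokenSourcesTest may differ); the idea is to attach the term text itself as the payload so the assertions above can verify that payloads survive the term-vector round trip:

private int curOffset;

// Hypothetical reconstruction of the helper used above: build a token
// whose payload is its own text, then advance past a one-character gap.
private Token getToken(String text) {
    Token t = new Token(text, curOffset, curOffset + text.length());
    t.setPayload(new BytesRef(text));
    curOffset += text.length() + 1;
    return t;
}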