use of com.yahoo.prelude.Index in project vespa by vespa-engine.
the class LiteralBoostSearcher method addLiterals.
/**
 * Recursively visits the query tree rooted at {@code item} and, for every term whose
 * index reports literal boost, adds a lowercased copy of the term's raw word to
 * {@code rankTerms}, addressed to the index name suffixed with {@code "_literal"}.
 *
 * @param rankTerms  the rank item receiving the generated literal terms
 * @param item       the root of the (sub)tree to visit; {@code null} is ignored
 * @param indexFacts the index information used to look up each term's index
 */
private void addLiterals(RankItem rankTerms, Item item, IndexFacts.Session indexFacts) {
    if (item == null) {
        return;
    }

    if (item instanceof NotItem) {
        // Only the positive side of a NOT is followed.
        Item positive = ((NotItem) item).getPositiveItem();
        addLiterals(rankTerms, positive, indexFacts);
        return;
    }

    if (item instanceof CompositeItem) {
        Iterator<Item> children = ((CompositeItem) item).getItemIterator();
        while (children.hasNext()) {
            addLiterals(rankTerms, children.next(), indexFacts);
        }
        return;
    }

    if (!(item instanceof TermItem)) {
        return;
    }

    TermItem term = (TermItem) item;
    Index termIndex = indexFacts.getIndex(term.getIndexName());
    if (termIndex.getLiteralBoost()) {
        String loweredWord = toLowerCase(term.getRawWord());
        String literalIndexName = termIndex.getName() + "_literal";
        rankTerms.addItem(new WordItem(loweredWord, literalIndexName));
    }
}
use of com.yahoo.prelude.Index in project vespa by vespa-engine.
the class StemmingSearcher method checkBlock.
/**
 * Decides whether a block should be stemmed and, if so, delegates to {@code stem}.
 *
 * The block is returned as-is when it is a prefix item, does not consist of words,
 * does not come from the query, is already stemmed, or belongs to an index whose
 * stem mode is {@code StemMode.NONE}.
 *
 * @return the result of {@code stem} for stemmable blocks, otherwise {@code b} itself
 */
private Item checkBlock(BlockItem b, boolean isCJK, Language language, IndexFacts.Session indexFacts, Map<Item, TaggableItem> reverseConnectivity) {
    boolean wordBlock = !(b instanceof PrefixItem) && b.isWords();
    if (!wordBlock) {
        return (Item) b;
    }

    if (!b.isFromQuery() || b.isStemmed()) {
        return (Item) b;
    }

    Index blockIndex = indexFacts.getIndex(b.getIndexName());
    if (blockIndex.getStemMode() == StemMode.NONE) {
        return (Item) b;
    }
    return stem(b, isCJK, language, reverseConnectivity, blockIndex);
}
use of com.yahoo.prelude.Index in project vespa by vespa-engine.
the class JuniperSearcher method highlight.
/**
 * Inserts highlighting tags into the fields of the given hits.
 *
 * Only {@code FastHit} instances are processed, and only when they are filled with
 * {@code summaryClass} (if one is given) and carry a value in {@code MAGIC_FIELD}.
 * For each field of such a hit, tags are inserted when the field's index reports a
 * dynamic summary or a highlight summary.
 *
 * @param bolding         passed through to {@code insertTags}
 * @param hitsToHighlight the hits to inspect
 * @param summaryClass    the summary class the hit must be filled with, or null for no requirement
 * @param indexFacts      the index information used to look up each field
 */
private void highlight(boolean bolding, Iterator<Hit> hitsToHighlight, String summaryClass, IndexFacts.Session indexFacts) {
    while (hitsToHighlight.hasNext()) {
        Hit candidate = hitsToHighlight.next();
        if (candidate instanceof FastHit) {
            FastHit fastHit = (FastHit) candidate;
            boolean summaryAvailable = summaryClass == null || fastHit.isFilled(summaryClass);
            // The magic field is only read when the summary requirement is satisfied.
            Object definitionMarker = summaryAvailable ? fastHit.getField(MAGIC_FIELD) : null;
            if (definitionMarker != null) {
                String definitionName = definitionMarker.toString();
                for (String field : fastHit.fields().keySet()) {
                    Index fieldIndex = indexFacts.getIndex(field, definitionName);
                    if (fieldIndex.getDynamicSummary() || fieldIndex.getHighlightSummary()) {
                        insertTags(fastHit.buildHitField(field, true, true), bolding, fieldIndex.getDynamicSummary());
                    }
                }
            }
        }
    }
}
use of com.yahoo.prelude.Index in project vespa by vespa-engine.
the class AbstractParser method generateLanguageDetectionTextFrom.
/**
 * Do a best-effort attempt at creating a single string for language detection from only the relevant
 * subset of tokens.
 * The relevant tokens are text tokens which follow names of indexes which have plain tokens.
 * Three shapes are recognized: {@code word}, {@code index:word} and {@code index:"quoted words"}.
 * When the text before a colon does not name a known index, it is treated as an ordinary
 * word belonging to the default index.
 *
 * This method does not modify the position of the given token stream.
 *
 * @param tokens       the token stream to scan; its position is restored before returning
 * @param indexFacts   the index information used to resolve index names
 * @param defaultIndex the name of the index used for text without a (known) index prefix
 * @return the space-separated text to run language detection on, possibly empty
 */
private String generateLanguageDetectionTextFrom(TokenPosition tokens, IndexFacts.Session indexFacts, String defaultIndex) {
    StringBuilder detectionText = new StringBuilder();
    int initialPosition = tokens.getPosition();
    while (tokens.hasNext()) {
        // look for occurrences of text, text:text and text:"quoted text"
        while (!tokens.currentIs(Token.Kind.WORD) && tokens.hasNext()) {
            tokens.next(); // skip nonwords
        }
        if (!tokens.hasNext())
            break;

        String queryText;
        Index index;
        Token word1 = tokens.next();
        if (is(Token.Kind.COLON, tokens.currentNoIgnore())) {
            // The quoted form must be tested before the plain form: both start with a colon,
            // so testing for the colon alone first would make the quoted case unreachable.
            if (is(Token.Kind.QUOTE, tokens.currentNoIgnore(1))) {
                tokens.next(); // colon
                tokens.next(); // opening quote
                StringBuilder quotedContent = new StringBuilder();
                while (!tokens.currentIs(Token.Kind.QUOTE) && tokens.hasNext()) {
                    Token token = tokens.next();
                    if (is(Token.Kind.WORD, token))
                        quotedContent.append(token.image).append(" ");
                }
                tokens.next(); // closing quote
                queryText = quotedContent.toString();
            } else {
                tokens.next(); // colon
                Token word2 = tokens.next();
                queryText = is(Token.Kind.WORD, word2) ? word2.image : "";
            }
            index = indexFacts.getIndex(word1.image);
            if (index.isNull()) {
                // word1 is not an index name: interpret it as a word in the default index too
                index = indexFacts.getIndex(defaultIndex);
                queryText = word1.image + " " + queryText;
            }
        } else {
            index = indexFacts.getIndex(defaultIndex);
            queryText = word1.image;
        }

        if (queryText != null && index.hasPlainTokens())
            detectionText.append(queryText).append(" ");
    }
    // Restore the stream so the caller sees it exactly as it was handed in.
    tokens.setPosition(initialPosition);
    return detectionText.toString();
}
use of com.yahoo.prelude.Index in project vespa by vespa-engine.
the class Tokenizer method tokenize.
/**
 * Resets this tokenizer and creates tokens from the given string.
 *
 * The string is scanned one position at a time. Letters, digits and (for some indexes)
 * apostrophes start a word or number; whitespace and a fixed set of punctuation characters,
 * including their full-width and typographic variants, each produce a single-character token;
 * anything else becomes a noise token. An end-of-input token is always appended last.
 *
 * @param string the string to tokenize
 * @param defaultIndexName the name of the index to use as default
 * @param indexFacts information about the indexes we will search
 * @return a read-only list of tokens. This list can only be used by this thread
 */
// To avoid this we need to pass an IndexFacts.session down instead - easily done but not without breaking API's
@SuppressWarnings({ "deprecation" })
public List<Token> tokenize(String string, String defaultIndexName, IndexFacts.Session indexFacts) {
    this.source = string;
    tokens.clear();
    parensToEat = 0;

    Index defaultIndex = indexFacts.getIndex(defaultIndexName);
    Index topLevelIndex = (defaultIndexName == null) ? Index.nullIndex : defaultIndex;
    Index currentIndex = topLevelIndex;

    for (int i = 0; i < source.length(); i++) {
        if (currentIndex.isExact()) {
            // currentIndex may change after seeing a colon below
            i = consumeExact(i, currentIndex);
            currentIndex = topLevelIndex;
        } else {
            i = consumeSpecialToken(i);
        }
        if (i >= source.length()) {
            break;
        }

        int c = source.codePointAt(i);
        int end = i + 1;

        // Word characters and whitespace are classified first; only the remaining
        // characters are matched against the punctuation table below.
        if (characterClasses.isLetterOrDigit(c) || (c == '\'' && acceptApostropheAsWordCharacter(currentIndex))) {
            i = consumeWordOrNumber(i, currentIndex);
        } else if (Character.isWhitespace(c)) {
            addToken(SPACE, " ", i, end);
        } else {
            switch (c) {
                case '"':
                case '\u201C':
                case '\u201D':
                case '\u201E':
                case '\u201F':
                case '\u2039':
                case '\u203A':
                case '\u00AB':
                case '\u00BB':
                case '\u301D':
                case '\u301E':
                case '\u301F':
                case '\uFF02':
                    addToken(QUOTE, "\"", i, end);
                    break;
                case '-':
                case '\uFF0D':
                    addToken(MINUS, "-", i, end);
                    break;
                case '+':
                case '\uFF0B':
                    addToken(PLUS, "+", i, end);
                    break;
                case '.':
                case '\uFF0E':
                    addToken(DOT, ".", i, end);
                    break;
                case ',':
                case '\uFF0C':
                    addToken(COMMA, ",", i, end);
                    break;
                case ':':
                case '\uFF1A':
                    // The index is resolved before the colon token itself is added.
                    currentIndex = determineCurrentIndex(defaultIndex, indexFacts);
                    addToken(COLON, ":", i, end);
                    break;
                case '(':
                case '\uFF08':
                    addToken(LBRACE, "(", i, end);
                    parensToEat++;
                    break;
                case ')':
                case '\uFF09':
                    addToken(RBRACE, ")", i, end);
                    // Never let the count of open parentheses go below zero.
                    parensToEat = Math.max(parensToEat - 1, 0);
                    break;
                case '[':
                case '\uFF3B':
                    addToken(LSQUAREBRACKET, "[", i, end);
                    break;
                case ']':
                case '\uFF3D':
                    addToken(RSQUAREBRACKET, "]", i, end);
                    break;
                case ';':
                case '\uFF1B':
                    addToken(SEMICOLON, ";", i, end);
                    break;
                case '>':
                case '\uFF1E':
                    addToken(GREATER, ">", i, end);
                    break;
                case '<':
                case '\uFF1C':
                    addToken(SMALLER, "<", i, end);
                    break;
                case '!':
                case '\uFF01':
                    addToken(EXCLAMATION, "!", i, end);
                    break;
                case '_':
                case '\uFF3F':
                    addToken(UNDERSCORE, "_", i, end);
                    break;
                case '^':
                case '\uFF3E':
                    addToken(HAT, "^", i, end);
                    break;
                case '*':
                case '\uFF0A':
                    addToken(STAR, "*", i, end);
                    break;
                case '$':
                case '\uFF04':
                    addToken(DOLLAR, "$", i, end);
                    break;
                default:
                    addToken(NOISE, "<NOISE>", i, end);
                    break;
            }
        }
    }

    addToken(EOF, "<EOF>", source.length(), source.length());
    source = null;
    return tokens;
}
Aggregations