use of com.yahoo.prelude.querytransform.PhraseMatcher.Phrase in project vespa by vespa-engine.
the class RewriterFeatures method getNonOverlappingPartialPhraseMatches.
/**
 * <p>Selects the longest, left to right, non overlapping partial phrase
 * matches found in the query by the given phrase matcher.</p>
 *
 * <p>Matches owned by an AND item are collected per owner and reduced with
 * {@link #getNonOverlappingMatchesInAndItem}; matches with any other owner
 * are kept only when they are a single word (and, under a RANK item, only
 * when they are the first child).</p>
 *
 * e.g. query: ((modern AND new AND york AND city AND travel) OR travel) AND
 *             ((sunny AND travel AND agency) OR nyc)<br>
 * dictionary: <br>
 * mny\tmodern new york<br>
 * mo\tmodern<br>
 * modern\tn/a<br>
 * modern new york\tn/a<br>
 * new york\tn/a<br>
 * new york city\tn/a<br>
 * new york city travel\tn/a<br>
 * new york company\tn/a<br>
 * ny\tnew york<br>
 * nyc\tnew york city\tnew york company<br>
 * nyct\tnew york city travel<br>
 * ta\ttravel agency<br>
 * travel agency\tn/a<br>
 * return: <br>
 * modern<br>
 * new york city travel<br>
 * travel agency<br>
 * nyc<br>
 *
 * @param phraseMatcher PhraseMatcher object loaded with FSA dict
 * @param query Query object from the searcher
 * @return the selected matches, or null if phraseMatcher is null or the
 *         matcher reports no matches
 * @throws RuntimeException if the matches of an AND item cannot be resolved
 */
public static Set<PhraseMatcher.Phrase> getNonOverlappingPartialPhraseMatches(PhraseMatcher phraseMatcher, Query query) throws RuntimeException {
    RewriterUtils.log(logger, query, "Retrieving longest non-overlapping partial phrase matches");
    if (phraseMatcher == null) {
        return null;
    }

    Item queryRoot = query.getModel().getQueryTree().getRoot();
    List<PhraseMatcher.Phrase> candidates = phraseMatcher.matchPhrases(queryRoot);
    if (candidates == null || candidates.isEmpty()) {
        return null;
    }

    Set<PhraseMatcher.Phrase> selected = new HashSet<>();
    // Matches collected so far for the AND item currently being walked
    List<PhraseMatcher.Phrase> pendingAndPhrases = new ArrayList<>();
    CompositeItem previousOwner = null;

    for (PhraseMatcher.Phrase candidate : candidates) {
        RewriterUtils.log(logger, query, "Working on phrase: " + candidate);
        CompositeItem owner = candidate.getOwner();

        // When this phrase belongs to a different owner than the previous one,
        // the pending AND item matches are complete: reduce them to the
        // non-overlapping ones before moving on.
        // NOTE(review): owners are compared with equals(), not by identity -
        // confirm that this is the intended notion of "same item".
        if (!pendingAndPhrases.isEmpty() && owner != null && previousOwner != null && !owner.equals(previousOwner)) {
            RewriterUtils.log(logger, query, "Previous phrase is in different AND item");
            // addAll must happen before clear(): the resolved list may be the pending list itself
            selected.addAll(resolvePendingAndPhrases(pendingAndPhrases, query));
            pendingAndPhrases.clear();
        }

        if (owner instanceof AndItem) {
            pendingAndPhrases.add(candidate);
        } else if (candidate.getLength() == 1 && (!(owner instanceof RankItem) || candidate.getStartIndex() == 0)) {
            // Outside AND items only single words are kept, so that e.g.
            // (new RANK york) is not treated as a match when only "new york",
            // but neither "new" nor "york", is in the dictionary.
            selected.add(candidate);
        }
        previousOwner = owner;
    }

    // Resolve whatever is still pending for the last AND item
    if (!pendingAndPhrases.isEmpty()) {
        RewriterUtils.log(logger, query, "Last phrase is in AND item");
        selected.addAll(resolvePendingAndPhrases(pendingAndPhrases, query));
    }

    RewriterUtils.log(logger, query, "Successfully Retrieved longest non-overlapping partial phrase matches");
    return selected;
}

/** Reduces the pending matches of one AND item to the non-overlapping ones, failing if none can be retrieved. */
private static List<PhraseMatcher.Phrase> resolvePendingAndPhrases(List<PhraseMatcher.Phrase> pendingAndPhrases, Query query) {
    List<PhraseMatcher.Phrase> resolved = getNonOverlappingMatchesInAndItem(pendingAndPhrases, query);
    if (resolved == null) {
        RewriterUtils.error(logger, query, "Error retrieving matches from subtree");
        throw new RuntimeException("Error retrieving matches from subtree");
    }
    return resolved;
}
use of com.yahoo.prelude.querytransform.PhraseMatcher.Phrase in project vespa by vespa-engine.
the class RewriterFeatures method getNonOverlappingFullPhraseMatches.
/**
 * <p>Selects the longest, left to right, non overlapping full phrase
 * matches found in the query by the given phrase matcher.</p>
 *
 * e.g. query: ((modern AND new AND york AND city AND travel) OR travel) AND
 *             ((sunny AND travel AND agency) OR nyc)<br>
 * dictionary: <br>
 * mny\tmodern new york<br>
 * mo\tmodern<br>
 * modern\tn/a<br>
 * modern new york\tn/a<br>
 * new york\tn/a<br>
 * new york city\tn/a<br>
 * new york city travel\tn/a<br>
 * new york company\tn/a<br>
 * ny\tnew york<br>
 * nyc\tnew york city\tnew york company<br>
 * nyct\tnew york city travel<br>
 * ta\ttravel agency<br>
 * travel agency\tn/a<br>
 * return: nyc
 *
 * @param phraseMatcher PhraseMatcher object loaded with FSA dict
 * @param query Query object from the searcher
 * @return the selected matches, or null if phraseMatcher is null or the
 *         matcher reports no matches
 */
public static Set<PhraseMatcher.Phrase> getNonOverlappingFullPhraseMatches(PhraseMatcher phraseMatcher, Query query) throws RuntimeException {
    RewriterUtils.log(logger, query, "Retrieving longest non-overlapping full phrase matches");
    if (phraseMatcher == null) {
        return null;
    }

    Item queryRoot = query.getModel().getQueryTree().getRoot();
    List<PhraseMatcher.Phrase> candidates = phraseMatcher.matchPhrases(queryRoot);
    if (candidates == null || candidates.isEmpty()) {
        return null;
    }

    Set<PhraseMatcher.Phrase> selected = new HashSet<>();
    for (PhraseMatcher.Phrase candidate : candidates) {
        RewriterUtils.log(logger, query, "Working on phrase: " + candidate);
        CompositeItem owner = candidate.getOwner();

        // A phrase is kept when it is a complete match of an AND item.
        // Otherwise only single words are kept (no owner, under an OR item,
        // or as the first child of a RANK item), in order to eliminate cases
        // such as (new RANK york) from being treated as a match if only
        // "new york", but not "new" or "york", is in the dictionary.
        boolean keep;
        if (owner == null) {
            keep = candidate.getLength() == 1;
        } else if (owner instanceof AndItem && candidate.isComplete()) {
            keep = true;
        } else {
            keep = candidate.getLength() == 1
                   && (owner instanceof OrItem || (owner instanceof RankItem && candidate.getStartIndex() == 0));
        }

        if (keep) {
            selected.add(candidate);
            RewriterUtils.log(logger, query, "Keeping phrase: " + candidate);
        }
    }

    RewriterUtils.log(logger, query, "Successfully Retrieved longest non-overlapping full phrase matches");
    return selected;
}
use of com.yahoo.prelude.querytransform.PhraseMatcher.Phrase in project vespa by vespa-engine.
the class RewriterFeatures method getNonOverlappingMatchesInAndItem.
/**
 * <p>Reduces the matches found within one AND item to the longest, left to
 * right, non overlapping ones.</p>
 *
 * <p>The given list is sorted and pruned in place, and the same list
 * instance is returned.</p>
 *
 * e.g. subtree: (modern AND new AND york AND city AND travel)<br>
 * dictionary:<br>
 * mny\tmodern new york<br>
 * mo\tmodern<br>
 * modern\tn/a<br>
 * modern new york\tn/a<br>
 * new york\tn/a<br>
 * new york city\tn/a<br>
 * new york city travel\tn/a<br>
 * new york company\tn/a<br>
 * ny\tnew york<br>
 * nyc\tnew york city\tnew york company<br>
 * nyct\tnew york city travel<br>
 * allMatches:<br>
 * modern<br>
 * modern new york<br>
 * new york<br>
 * new york city<br>
 * new york city travel<br>
 * return: <br>
 * modern<br>
 * new york city travel<br>
 *
 * @param allMatches All matches within the subtree; modified by this call
 * @param query Query object from the searcher
 * @return allMatches with overlapping and filter matches removed, or null
 *         if allMatches is null or empty
 */
public static List<PhraseMatcher.Phrase> getNonOverlappingMatchesInAndItem(List<PhraseMatcher.Phrase> allMatches, Query query) throws RuntimeException {
    RewriterUtils.log(logger, query, "Retrieving longest non-overlapping matches in subtree");
    if (allMatches == null || allMatches.isEmpty()) {
        return null;
    }

    // NOTE(review): a lone match is returned as is, without the filter item
    // check applied below to lists of two or more - confirm this is intended.
    if (allMatches.size() == 1) {
        RewriterUtils.log(logger, query, "Only one match in subtree");
        return allMatches;
    }

    // Order the candidates by priority. Per the original comment PhraseLength
    // ranks longer phrases first and, for equal length, the leftmost first.
    RewriterUtils.log(logger, query, "Sorting the phrases");
    Collections.sort(allMatches, new PhraseLength());

    // One bit per child of the owning item; a set bit marks a position
    // already claimed by a match that has been kept.
    int ownerItemCount = allMatches.get(0).getOwner().getItemCount();
    BitSet claimedPositions = new BitSet(ownerItemCount);

    RewriterUtils.log(logger, query, "Removing matches that are overlapping with previously selected ones");
    for (ListIterator<Phrase> cursor = allMatches.listIterator(); cursor.hasNext(); ) {
        PhraseMatcher.Phrase candidate = cursor.next();
        PhraseMatcher.Phrase.MatchIterator words = candidate.itemIterator();

        // Only the first item of the phrase is inspected for the filter flag
        boolean startsWithFilter = words.hasNext() && words.next().isFilter();
        if (startsWithFilter) {
            RewriterUtils.log(logger, query, "Removing filter item" + candidate);
            cursor.remove();
        } else {
            int firstPosition = candidate.getStartIndex();
            BitSet candidatePositions = new BitSet(ownerItemCount);
            candidatePositions.set(firstPosition, candidate.getLength() + firstPosition);

            if (candidatePositions.intersects(claimedPositions)) {
                RewriterUtils.log(logger, query, "Removing " + candidate);
                cursor.remove();
            } else {
                RewriterUtils.log(logger, query, "Keeping " + candidate);
                claimedPositions.or(candidatePositions);
            }
        }
    }
    return allMatches;
}
use of com.yahoo.prelude.querytransform.PhraseMatcher.Phrase in project vespa by vespa-engine.
the class RewriterFeatures method addExpansions.
/**
 * <p>Add Expansions to the matching phrases</p>
 *
 * <p>Each match is replaced in its owner by an OR group holding its
 * expansions; the query tree is modified in place and the query type is set
 * to ADVANCED.</p>
 *
 * e.g. Query: nyc travel agency<br>
 * matching phrase: nyc\tnew york city\tnew york company<br>
 *                  travel agency\tn/a<br>
 * if expandIndex is not null and removeOriginal is true<br>
 * New Query: ((new york city) OR ([expandIndex]:new york city)
 *             OR (new york company) OR
 *             ([expandIndex]:new york company)) AND
 *             ((travel agency) OR ([expandIndex]:travel agency))<br>
 * if expandIndex is null and removeOriginal is true<br>
 * New Query: ((new york city) OR (new york company)) AND
 *             travel agency<br>
 * if expandIndex is null and removeOriginal is false<br>
 * New Query: (nyc OR (new york city) OR (new york company)) AND
 *             travel agency<br>
 *
 * @param query Query object from searcher
 * @param matches Set of longest non-overlapping matches
 * @param expandIndex Name of expansion index or null if
 *                    default index
 * @param maxNumRewrites Max number of rewrites to be added,
 *                       0 if no limit
 * @param removeOriginal Whether to remove the original matching phrase
 * @param addUnitToRewrites Whether to add rewrite as phrase
 * @return the given query; returned untouched when matches is null
 * @throws RuntimeException if the rewritten query tree cannot be
 *                          canonicalized; the message includes the reason
 *                          reported by the canonicalizer
 */
public static Query addExpansions(Query query, Set<PhraseMatcher.Phrase> matches, String expandIndex, int maxNumRewrites, boolean removeOriginal, boolean addUnitToRewrites) throws RuntimeException {
    if (matches == null) {
        RewriterUtils.log(logger, query, "No expansions to be added");
        return query;
    }
    RewriterUtils.log(logger, query, "Adding expansions to matching phrases");
    Model queryModel = query.getModel();
    QueryTree qTree = queryModel.getQueryTree();
    // Owner of the last match that was rewritten; stays null if nothing was
    // rewritten or if the match was the single root item
    CompositeItem parent = null;

    // Iterate through all matches
    for (PhraseMatcher.Phrase match : matches) {
        RewriterUtils.log(logger, query, "Working on phrase: " + match);

        // Retrieve expansion phrases
        String expansionStr = match.getData();
        // Without an expansion index an "n/a" entry has nothing to add
        if (expansionStr.equalsIgnoreCase("n/a") && expandIndex == null) {
            continue;
        }
        StringTokenizer expansions = new StringTokenizer(expansionStr, "\t");

        // Create this structure for all expansions of this match
        // (OR (AND expansion1) indexName:expansion1
        //     (AND expansion2) indexName:expansion2..)
        OrItem expansionGrp = new OrItem();
        int numRewrites = 0;
        String matchStr = convertMatchToString(match);
        while (expansions.hasMoreTokens() && (maxNumRewrites == 0 || numRewrites < maxNumRewrites)) {
            String expansion = expansions.nextToken();
            RewriterUtils.log(logger, query, "Working on expansion: " + expansion);
            // "n/a" stands for the matching phrase itself
            if (expansion.equalsIgnoreCase("n/a")) {
                expansion = matchStr;
            }
            // (AND expansion) or "expansion"
            Item expansionItem = convertStringToQTree(query, expansion);
            if (addUnitToRewrites && expansionItem instanceof AndItem) {
                expansionItem = convertAndToPhrase((AndItem) expansionItem);
            }
            expansionGrp.addItem(expansionItem);
            if (expandIndex != null) {
                // indexName:expansion
                expansionGrp.addItem(new WordItem(expansion, expandIndex));
            }
            numRewrites++;
            RewriterUtils.log(logger, query, "Adding expansion: " + expansion);
        }

        if (!removeOriginal) {
            // (AND original), unless an equal item is already in the group
            Item matchItem = convertStringToQTree(query, matchStr);
            if (expansionGrp.getItemIndex(matchItem) == -1) {
                expansionGrp.addItem(matchItem);
            }
        }

        parent = match.getOwner();
        int matchIndex = match.getStartIndex();
        if (parent != null) {
            // Remove matching phrase from original query
            // NOTE(review): this assumes the start index of each match is
            // still valid after earlier matches in the same owner have been
            // replaced by a single OR item - TODO confirm.
            for (int i = 0; i < match.getLength(); i++) {
                parent.removeItem(matchIndex);
            }
            // Adding back expansions
            parent.addItem(matchIndex, expansionGrp);
        } else {
            RewriterUtils.log(logger, query, "Single root item");
            // If there's no parent, i.e. single root item
            qTree.setRoot(expansionGrp);
            break;
        }
    }

    // Not root single item
    if (parent != null) {
        // Cleaning up the query after rewrite to remove redundant tags
        // e.g. (AND (OR (AND a b) c)) => (OR (AND a b) c)
        String cleanupError = QueryCanonicalizer.canonicalize(qTree);
        if (cleanupError != null) {
            // Keep the canonicalizer's own explanation so the failure can be diagnosed
            String message = "Error canonicalizing query tree: " + cleanupError;
            RewriterUtils.error(logger, query, message);
            throw new RuntimeException(message);
        }
    }

    // set type=adv
    queryModel.setType(Query.Type.ADVANCED);
    RewriterUtils.log(logger, query, "Successfully added expansions to matching phrases");
    return query;
}
Aggregations