use of com.yahoo.prelude.querytransform.PhraseMatcher in project vespa by vespa-engine.
the class PhraseMatcherTestCase method testPhraseMatchingWithPluralIgnore.
@Test
public void testPhraseMatchingWithPluralIgnore() {
    // The query words sitting between the two noise terms.
    String[] queryWords = { "thi", "is", "a", "tests" };
    // The replacement the matcher is expected to report for each word (null = no replacement).
    String[] expectedReplacements = { "this", null, null, "test" };

    PhraseMatcher matcher = new PhraseMatcher("src/test/java/com/yahoo/prelude/querytransform/test/test-fsa.fsa", true);

    AndItem and = new AndItem();
    and.addItem(new WordItem("noisebefore"));
    for (String word : queryWords) {
        and.addItem(new WordItem(word));
    }
    and.addItem(new WordItem("noiseafter"));

    List<PhraseMatcher.Phrase> matches = matcher.matchPhrases(and);
    assertNotNull(matches);
    assertEquals(1, matches.size());

    // The single match must cover the four middle words, starting right after "noisebefore".
    PhraseMatcher.Phrase match = matches.get(0);
    assertEquals(4, match.getLength());
    assertEquals("", match.getData());
    assertEquals(and, match.getOwner());
    assertEquals(1, match.getStartIndex());

    // Walk the matched items and check both the item and its replacement, in order.
    PhraseMatcher.Phrase.MatchIterator matchedItems = match.itemIterator();
    for (int pos = 0; pos < queryWords.length; pos++) {
        assertEquals(new WordItem(queryWords[pos]), matchedItems.next());
        assertEquals(expectedReplacements[pos], matchedItems.getReplace());
    }
    assertFalse(matchedItems.hasNext());
}
use of com.yahoo.prelude.querytransform.PhraseMatcher in project vespa by vespa-engine.
the class RewriterFeatures method getNonOverlappingPartialPhraseMatches.
/**
 * <p>Retrieve the longest, from left to right non overlapping partial
 * phrase substrings in query based on FSA dictionary</p>
 *
 * <p>Matches owned by an AndItem are collected per owner and reduced with
 * {@code getNonOverlappingMatchesInAndItem}. Matches with any other owner
 * (or no owner) are kept only when they are a single word, and, for a
 * RankItem owner, only when they start at index 0.</p>
 *
 * e.g. query: ((modern AND new AND york AND city AND travel) OR travel) AND
 * ((sunny AND travel AND agency) OR nyc)<br>
 * dictionary: <br>
 * mny\tmodern new york<br>
 * mo\tmodern<br>
 * modern\tn/a<br>
 * modern new york\tn/a<br>
 * new york\tn/a<br>
 * new york city\tn/a<br>
 * new york city travel\tn/a<br>
 * new york company\tn/a<br>
 * ny\tnew york<br>
 * nyc\tnew york city\tnew york company<br>
 * nyct\tnew york city travel<br>
 * ta\ttravel agency<br>
 * travel agency\tn/a<br>
 * return: <br>
 * modern<br>
 * new york city travel<br>
 * travel agency<br>
 * nyc<br>
 *
 * @param phraseMatcher PhraseMatcher object loaded with FSA dict
 * @param query Query object from the searcher
 * @return Matching phrases, or {@code null} when {@code phraseMatcher} is
 *         {@code null} or the matcher reports no matches
 * @throws RuntimeException if the per-AND-item reduction returns {@code null}
 */
public static Set<PhraseMatcher.Phrase> getNonOverlappingPartialPhraseMatches(PhraseMatcher phraseMatcher, Query query) throws RuntimeException {
    RewriterUtils.log(logger, query, "Retrieving longest non-overlapping partial phrase matches");
    if (phraseMatcher == null) {
        return null;
    }

    Item queryRoot = query.getModel().getQueryTree().getRoot();
    List<PhraseMatcher.Phrase> allMatches = phraseMatcher.matchPhrases(queryRoot);
    if (allMatches == null || allMatches.isEmpty()) {
        return null;
    }

    Set<PhraseMatcher.Phrase> keptMatches = new HashSet<>();
    // Matches collected so far for the AND item currently being walked.
    ArrayList<PhraseMatcher.Phrase> pendingAndMatches = new ArrayList<>();
    CompositeItem previousOwner = null;

    for (PhraseMatcher.Phrase candidate : allMatches) {
        RewriterUtils.log(logger, query, "Working on phrase: " + candidate);
        CompositeItem owner = candidate.getOwner();

        // The owner changed: resolve overlaps among the matches gathered for
        // the previous owner before starting a new group.
        if (!pendingAndMatches.isEmpty()
                && owner != null
                && previousOwner != null
                && !owner.equals(previousOwner)) {
            RewriterUtils.log(logger, query, "Previous phrase is in different AND item");
            List<PhraseMatcher.Phrase> resolved = getNonOverlappingMatchesInAndItem(pendingAndMatches, query);
            if (resolved == null) {
                RewriterUtils.error(logger, query, "Error retrieving matches from subtree");
                throw new RuntimeException("Error retrieving matches from subtree");
            }
            keptMatches.addAll(resolved);
            pendingAndMatches.clear();
        }

        if (owner instanceof AndItem) {
            // Defer the decision: overlaps inside an AND item are resolved as a group.
            pendingAndMatches.add(candidate);
        } else if (candidate.getLength() == 1
                && (!(owner instanceof RankItem) || candidate.getStartIndex() == 0)) {
            // Outside an AND item only single words are kept, so that e.g.
            // (new RANK york) is not treated as a match when only "new york",
            // but neither "new" nor "york", is in the dictionary.
            keptMatches.add(candidate);
        }
        previousOwner = owner;
    }

    // Resolve whatever is still pending for the last AND item seen.
    if (!pendingAndMatches.isEmpty()) {
        RewriterUtils.log(logger, query, "Last phrase is in AND item");
        List<PhraseMatcher.Phrase> resolved = getNonOverlappingMatchesInAndItem(pendingAndMatches, query);
        if (resolved == null) {
            RewriterUtils.error(logger, query, "Error retrieving matches from subtree");
            throw new RuntimeException("Error retrieving matches from subtree");
        }
        keptMatches.addAll(resolved);
    }

    RewriterUtils.log(logger, query, "Successfully Retrieved longest non-overlapping partial phrase matches");
    return keptMatches;
}
use of com.yahoo.prelude.querytransform.PhraseMatcher in project vespa by vespa-engine.
the class RewriterFeatures method getNonOverlappingFullPhraseMatches.
/**
 * <p>Retrieve the longest, from left to right non overlapping full
 * phrase substrings in query based on FSA dictionary</p>
 *
 * <p>A match is kept when one of the following holds:</p>
 * <ul>
 *   <li>it has no owner and is a single word;</li>
 *   <li>it is complete and its owner is an AndItem;</li>
 *   <li>it is a single word and its owner is an OrItem;</li>
 *   <li>it is a single word at start index 0 and its owner is a RankItem.</li>
 * </ul>
 *
 * e.g. query: ((modern AND new AND york AND city AND travel) OR travel) AND
 * ((sunny AND travel AND agency) OR nyc)<br>
 * dictionary: <br>
 * mny\tmodern new york<br>
 * mo\tmodern<br>
 * modern\tn/a<br>
 * modern\tnew york\tn/a<br>
 * new york\tn/a<br>
 * new york city\tn/a<br>
 * new york city travel\tn/a<br>
 * new york company\tn/a<br>
 * ny\tnew york<br>
 * nyc\tnew york city\tnew york company<br>
 * nyct\tnew york city travel<br>
 * ta\ttravel agency<br>
 * travel agency\tn/a<br>
 * return: nyc
 *
 * @param phraseMatcher PhraseMatcher object loaded with FSA dict
 * @param query Query object from the searcher
 * @return Matching phrases, or {@code null} when {@code phraseMatcher} is
 *         {@code null} or the matcher reports no matches
 */
public static Set<PhraseMatcher.Phrase> getNonOverlappingFullPhraseMatches(PhraseMatcher phraseMatcher, Query query) throws RuntimeException {
    RewriterUtils.log(logger, query, "Retrieving longest non-overlapping full phrase matches");
    if (phraseMatcher == null) {
        return null;
    }

    Item queryRoot = query.getModel().getQueryTree().getRoot();
    List<PhraseMatcher.Phrase> allMatches = phraseMatcher.matchPhrases(queryRoot);
    if (allMatches == null || allMatches.isEmpty()) {
        return null;
    }

    Set<PhraseMatcher.Phrase> keptMatches = new HashSet<>();
    for (PhraseMatcher.Phrase candidate : allMatches) {
        RewriterUtils.log(logger, query, "Working on phrase: " + candidate);
        CompositeItem owner = candidate.getOwner();

        // Multi-word matches only count when they completely cover an AND item.
        // Everywhere else only single words are accepted, so that e.g.
        // (new RANK york) is not treated as a match when only "new york",
        // but neither "new" nor "york", is in the dictionary.
        boolean keep;
        if (owner == null) {
            keep = candidate.getLength() == 1;
        } else if (candidate.isComplete() && owner instanceof AndItem) {
            keep = true;
        } else if (candidate.getLength() == 1) {
            keep = owner instanceof OrItem
                    || (owner instanceof RankItem && candidate.getStartIndex() == 0);
        } else {
            keep = false;
        }

        if (keep) {
            keptMatches.add(candidate);
            RewriterUtils.log(logger, query, "Keeping phrase: " + candidate);
        }
    }

    RewriterUtils.log(logger, query, "Successfully Retrieved longest non-overlapping full phrase matches");
    return keptMatches;
}
use of com.yahoo.prelude.querytransform.PhraseMatcher in project vespa by vespa-engine.
the class GenericExpansionRewriter method configure.
/**
 * Instance creation time config loading besides FSA.
 * Create PhraseMatcher from FSA dict
 *
 * <p>Looks up the dictionary registered under {@code GENERIC_EXPAND_DICT}
 * in {@code rewriterDicts}, builds the {@code phraseMatcher} from it and
 * switches the matcher to match single items and to return all matches.</p>
 *
 * @param fileAcquirer not used by this method
 * @param config not used by this method
 * @param fileList not used by this method
 * @return {@code true} when the phrase matcher was created and configured;
 *         {@code false} when the dictionary is missing or the matcher
 *         could not be created (an error is logged in both cases)
 */
public boolean configure(FileAcquirer fileAcquirer, RewritesConfig config, HashMap<String, File> fileList) {
    logger = Logger.getLogger(GenericExpansionRewriter.class.getName());
    FSA fsa = (FSA) rewriterDicts.get(GENERIC_EXPAND_DICT);
    if (fsa == null) {
        RewriterUtils.error(logger, "Error retrieving FSA dictionary: " + GENERIC_EXPAND_DICT);
        return false;
    }
    // Create Phrase Matcher
    RewriterUtils.log(logger, "Creating PhraseMatcher");
    try {
        phraseMatcher = new PhraseMatcher(fsa, false);
    } catch (IllegalArgumentException e) {
        // Include the cause in the log; otherwise the reason the matcher
        // could not be built is lost when we return false.
        RewriterUtils.error(logger, "Error creating phrase matcher: " + e.getMessage());
        return false;
    }
    // Match single word as well
    phraseMatcher.setMatchSingleItems(true);
    // Return all matches instead of only the longest match
    phraseMatcher.setMatchAll(true);
    return true;
}
use of com.yahoo.prelude.querytransform.PhraseMatcher in project vespa by vespa-engine.
the class PhraseMatcherTestCase method testPhraseMatchingCaseInsensitiveWithPluralIgnore.
@Test
public void testPhraseMatchingCaseInsensitiveWithPluralIgnore() {
    // Mixed-case query words sitting between the two noise terms.
    String[] mixedCaseWords = { "thI", "Is", "A", "tEsts" };
    // The replacement the matcher is expected to report for each word (null = no replacement).
    String[] expectedReplacements = { "this", null, null, "test" };

    PhraseMatcher matcher = new PhraseMatcher("src/test/java/com/yahoo/prelude/querytransform/test/test-fsa.fsa", true);

    AndItem and = new AndItem();
    and.addItem(new WordItem("noisebefore"));
    for (String word : mixedCaseWords) {
        and.addItem(new WordItem(word));
    }
    and.addItem(new WordItem("noiseafter"));

    List<PhraseMatcher.Phrase> matches = matcher.matchPhrases(and);
    assertNotNull(matches);
    assertEquals(1, matches.size());

    // The single match must cover the four middle words, starting right after "noisebefore".
    PhraseMatcher.Phrase match = matches.get(0);
    assertEquals(4, match.getLength());
    assertEquals("", match.getData());
    assertEquals(and, match.getOwner());
    assertEquals(1, match.getStartIndex());

    // The matched items keep their original casing; check item and replacement in order.
    PhraseMatcher.Phrase.MatchIterator matchedItems = match.itemIterator();
    for (int pos = 0; pos < mixedCaseWords.length; pos++) {
        assertEquals(new WordItem(mixedCaseWords[pos]), matchedItems.next());
        assertEquals(expectedReplacements[pos], matchedItems.getReplace());
    }
    assertFalse(matchedItems.hasNext());
}
Aggregations