Search in sources :

Example 6 with PhraseMatcher

use of com.yahoo.prelude.querytransform.PhraseMatcher in project vespa by vespa-engine.

The method testPhraseMatchingWithPluralIgnore of the class PhraseMatcherTestCase.

/**
 * Matches the word sequence "thi is a tests", surrounded by one noise word on
 * each side, against the test FSA. The matcher is created with its second
 * constructor argument set to true (presumably the plural-ignore switch, going
 * by the test name). Expects a single four-word phrase starting at index 1 in
 * which the first and last words carry the replacements "this" and "test".
 */
@Test
public void testPhraseMatchingWithPluralIgnore() {
    PhraseMatcher phraseMatcher = new PhraseMatcher("src/test/java/com/yahoo/prelude/querytransform/test/test-fsa.fsa", true);

    // Query: one noise word, the four phrase words, one more noise word.
    AndItem root = new AndItem();
    for (String word : new String[] { "noisebefore", "thi", "is", "a", "tests", "noiseafter" }) {
        root.addItem(new WordItem(word));
    }

    List<?> found = phraseMatcher.matchPhrases(root);
    assertNotNull(found);
    assertEquals(1, found.size());

    PhraseMatcher.Phrase phrase = (PhraseMatcher.Phrase) found.get(0);
    assertEquals(4, phrase.getLength());
    assertEquals("", phrase.getData());
    assertEquals(root, phrase.getOwner());
    assertEquals(1, phrase.getStartIndex());

    // Each row: the word expected from the iterator, and its expected replacement (null = none).
    String[][] expected = { { "thi", "this" }, { "is", null }, { "a", null }, { "tests", "test" } };
    PhraseMatcher.Phrase.MatchIterator items = phrase.itemIterator();
    for (String[] wordAndReplacement : expected) {
        assertEquals(new WordItem(wordAndReplacement[0]), items.next());
        assertEquals(wordAndReplacement[1], items.getReplace());
    }
    assertFalse(items.hasNext());
}
Also used : AndItem(com.yahoo.prelude.query.AndItem) WordItem(com.yahoo.prelude.query.WordItem) PhraseMatcher(com.yahoo.prelude.querytransform.PhraseMatcher) Test(org.junit.Test)

Example 7 with PhraseMatcher

use of com.yahoo.prelude.querytransform.PhraseMatcher in project vespa by vespa-engine.

The method getNonOverlappingPartialPhraseMatches of the class RewriterFeatures.

/**
 * <p>Retrieve the longest, from left to right, non-overlapping partial
 * phrase substrings in the query based on the FSA dictionary</p>
 *
 * e.g. query: ((modern AND new AND york AND city AND travel) OR travel) AND
 *             ((sunny AND travel AND agency) OR nyc)<br>
 *      dictionary: <br>
 *                  mny\tmodern new york<br>
 *                  mo\tmodern<br>
 *                  modern\tn/a<br>
 *                  modern new york\tn/a<br>
 *                  new york\tn/a<br>
 *                  new york city\tn/a<br>
 *                  new york city travel\tn/a<br>
 *                  new york company\tn/a<br>
 *                  ny\tnew york<br>
 *                  nyc\tnew york city\tnew york company<br>
 *                  nyct\tnew york city travel<br>
 *                  ta\ttravel agency<br>
 *                  travel agency\tn/a<br>
 *      return: <br>
 *              modern<br>
 *              new york city travel<br>
 *              travel agency<br>
 *              nyc<br>
 * @param phraseMatcher PhraseMatcher object loaded with FSA dict
 * @param query Query object from the searcher
 * @return Matching phrases, or null if phraseMatcher is null or the
 *         matcher reports no phrases for the query tree
 * @throws RuntimeException if the non-overlapping matches of a group of
 *         phrases collected from AND items cannot be retrieved
 */
public static Set<PhraseMatcher.Phrase> getNonOverlappingPartialPhraseMatches(PhraseMatcher phraseMatcher, Query query) throws RuntimeException {
    RewriterUtils.log(logger, query, "Retrieving longest non-overlapping partial phrase matches");
    if (phraseMatcher == null)
        return null;
    Item root = query.getModel().getQueryTree().getRoot();
    List<PhraseMatcher.Phrase> matches = phraseMatcher.matchPhrases(root);
    if (matches == null || matches.isEmpty())
        return null;
    Set<PhraseMatcher.Phrase> resultMatches = new HashSet<>();
    // Phrases collected from AND items since the last flush
    ArrayList<PhraseMatcher.Phrase> phrasesInSubTree = new ArrayList<>();
    CompositeItem prevOwner = null;
    // Iterate through all matches
    for (PhraseMatcher.Phrase phrase : matches) {
        RewriterUtils.log(logger, query, "Working on phrase: " + phrase);
        CompositeItem currOwner = phrase.getOwner();
        // Check if phrases are pending and this phrase has a different owner
        // than the previous one. If so, work on the pending set to eliminate
        // overlapping matches before starting a new set
        if (!phrasesInSubTree.isEmpty() && currOwner != null && prevOwner != null && !currOwner.equals(prevOwner)) {
            RewriterUtils.log(logger, query, "Previous phrase is in different AND item");
            resultMatches.addAll(resolveOverlapsInSubTree(phrasesInSubTree, query));
            phrasesInSubTree.clear();
        }
        // Check if this is an AND item (instanceof is false for a null owner)
        if (currOwner instanceof AndItem) {
            phrasesInSubTree.add(phrase);
        // If phrase is not an AND item, only keep those that are single word
        // in order to eliminate cases such as (new RANK york) from being treated
        // as match if only new york but not new or york is in the dictionary.
        // Under a RANK item only the item at index 0 is kept
        } else if (phrase.getLength() == 1 && (!(currOwner instanceof RankItem) || phrase.getStartIndex() == 0)) {
            resultMatches.add(phrase);
        }
        prevOwner = currOwner;
    }
    // Check if phrases are still pending after the last match. If so, work on
    // that set to eliminate overlapping matches
    if (!phrasesInSubTree.isEmpty()) {
        RewriterUtils.log(logger, query, "Last phrase is in AND item");
        resultMatches.addAll(resolveOverlapsInSubTree(phrasesInSubTree, query));
    }
    RewriterUtils.log(logger, query, "Successfully Retrieved longest non-overlapping partial phrase matches");
    return resultMatches;
}

/**
 * Reduces the pending phrases to their non-overlapping matches by delegating to
 * getNonOverlappingMatchesInAndItem, turning a null result into a logged error
 * and a RuntimeException.
 *
 * @param phrasesInSubTree phrases collected from AND items since the last flush
 * @param query Query object from the searcher, used for logging
 * @return the non-overlapping matches, never null
 * @throws RuntimeException if the delegate returns null
 */
private static List<PhraseMatcher.Phrase> resolveOverlapsInSubTree(ArrayList<PhraseMatcher.Phrase> phrasesInSubTree, Query query) {
    List<PhraseMatcher.Phrase> subTreeMatches = getNonOverlappingMatchesInAndItem(phrasesInSubTree, query);
    if (subTreeMatches == null) {
        RewriterUtils.error(logger, query, "Error retrieving matches from subtree");
        throw new RuntimeException("Error retrieving matches from subtree");
    }
    return subTreeMatches;
}
Also used : Phrase(com.yahoo.prelude.querytransform.PhraseMatcher.Phrase) Phrase(com.yahoo.prelude.querytransform.PhraseMatcher.Phrase) PhraseMatcher(com.yahoo.prelude.querytransform.PhraseMatcher)

Example 8 with PhraseMatcher

use of com.yahoo.prelude.querytransform.PhraseMatcher in project vespa by vespa-engine.

The method getNonOverlappingFullPhraseMatches of the class RewriterFeatures.

/**
 * <p>Retrieve the longest, from left to right non overlapping full
 * phrase substrings in query based on FSA dictionary</p>
 *
 * e.g. query: ((modern AND new AND york AND city AND travel) OR travel) AND
 *             ((sunny AND travel AND agency) OR nyc)<br>
 *      dictionary: <br>
 *                  mny\tmodern new york<br>
 *                  mo\tmodern<br>
 *                  modern\tn/a<br>
 *                  modern\tnew york\tn/a<br>
 *                  new york\tn/a<br>
 *                  new york city\tn/a<br>
 *                  new york city travel\tn/a<br>
 *                  new york company\tn/a<br>
 *                  ny\tnew york<br>
 *                  nyc\tnew york city\tnew york company<br>
 *                  nyct\tnew york city travel<br>
 *                  ta\ttravel agency<br>
 *                  travel agency\tn/a<br>
 *      return: nyc
 * @param phraseMatcher PhraseMatcher object loaded with FSA dict
 * @param query Query object from the searcher
 * @return Matching phrases, or null when phraseMatcher is null or the
 *         matcher reports no phrases for the query tree
 */
public static Set<PhraseMatcher.Phrase> getNonOverlappingFullPhraseMatches(PhraseMatcher phraseMatcher, Query query) throws RuntimeException {
    RewriterUtils.log(logger, query, "Retrieving longest non-overlapping full phrase matches");
    if (phraseMatcher == null) {
        return null;
    }
    Item queryRoot = query.getModel().getQueryTree().getRoot();
    List<PhraseMatcher.Phrase> candidates = phraseMatcher.matchPhrases(queryRoot);
    if (candidates == null || candidates.isEmpty()) {
        return null;
    }
    Set<PhraseMatcher.Phrase> kept = new HashSet<>();
    // Examine every candidate and keep only those that qualify
    for (PhraseMatcher.Phrase candidate : candidates) {
        RewriterUtils.log(logger, query, "Working on phrase: " + candidate);
        CompositeItem owner = candidate.getOwner();
        boolean keep;
        if (owner == null) {
            // Without an owner, only a single-item phrase qualifies
            keep = candidate.getLength() == 1;
        } else {
            // Under an AND item the phrase must be reported as complete.
            // Under an OR item it must be a single item. Under a RANK item
            // it must be a single item at index 0
            keep = (candidate.isComplete() && owner instanceof AndItem)
                    || (candidate.getLength() == 1
                        && (owner instanceof OrItem
                            || (owner instanceof RankItem && candidate.getStartIndex() == 0)));
        }
        if (keep) {
            kept.add(candidate);
            RewriterUtils.log(logger, query, "Keeping phrase: " + candidate);
        }
    }
    RewriterUtils.log(logger, query, "Successfully Retrieved longest non-overlapping full phrase matches");
    return kept;
}
Also used : Phrase(com.yahoo.prelude.querytransform.PhraseMatcher.Phrase) Phrase(com.yahoo.prelude.querytransform.PhraseMatcher.Phrase) PhraseMatcher(com.yahoo.prelude.querytransform.PhraseMatcher)

Example 9 with PhraseMatcher

use of com.yahoo.prelude.querytransform.PhraseMatcher in project vespa by vespa-engine.

The method configure of the class GenericExpansionRewriter.

/**
 * Instance creation time config loading besides FSA.
 * Create PhraseMatcher from FSA dict
 *
 * @param fileAcquirer not read by this implementation
 * @param config not read by this implementation
 * @param fileList not read by this implementation
 * @return true if the phrase matcher was created and configured, false if
 *         the FSA dictionary is missing or the phrase matcher could not be
 *         created
 */
public boolean configure(FileAcquirer fileAcquirer, RewritesConfig config, HashMap<String, File> fileList) {
    logger = Logger.getLogger(GenericExpansionRewriter.class.getName());
    FSA fsa = (FSA) rewriterDicts.get(GENERIC_EXPAND_DICT);
    if (fsa == null) {
        RewriterUtils.error(logger, "Error retrieving FSA dictionary: " + GENERIC_EXPAND_DICT);
        return false;
    }
    // Create Phrase Matcher
    RewriterUtils.log(logger, "Creating PhraseMatcher");
    try {
        phraseMatcher = new PhraseMatcher(fsa, false);
    } catch (IllegalArgumentException e) {
        // Include the exception message so the reason for the failure is not lost
        RewriterUtils.error(logger, "Error creating phrase matcher: " + e.getMessage());
        return false;
    }
    // Match single word as well
    phraseMatcher.setMatchSingleItems(true);
    // Return all matches instead of only the longest match
    phraseMatcher.setMatchAll(true);
    return true;
}
Also used : FSA(com.yahoo.fsa.FSA) PhraseMatcher(com.yahoo.prelude.querytransform.PhraseMatcher)

Example 10 with PhraseMatcher

use of com.yahoo.prelude.querytransform.PhraseMatcher in project vespa by vespa-engine.

The method testPhraseMatchingCaseInsensitiveWithPluralIgnore of the class PhraseMatcherTestCase.

/**
 * Same scenario as the lower-case plural-ignore test, but the four phrase
 * words are written in mixed case ("thI Is A tEsts"). Expects one four-word
 * phrase starting at index 1 whose iterator yields the original mixed-case
 * items, with the replacements "this" and "test" on the first and last word.
 */
@Test
public void testPhraseMatchingCaseInsensitiveWithPluralIgnore() {
    PhraseMatcher phraseMatcher = new PhraseMatcher("src/test/java/com/yahoo/prelude/querytransform/test/test-fsa.fsa", true);

    // The four mixed-case words that make up the phrase, in query order.
    final String[] phraseWords = { "thI", "Is", "A", "tEsts" };

    AndItem root = new AndItem();
    root.addItem(new WordItem("noisebefore"));
    for (String phraseWord : phraseWords) {
        root.addItem(new WordItem(phraseWord));
    }
    root.addItem(new WordItem("noiseafter"));

    List<?> found = phraseMatcher.matchPhrases(root);
    assertNotNull(found);
    assertEquals(1, found.size());

    PhraseMatcher.Phrase phrase = (PhraseMatcher.Phrase) found.get(0);
    assertEquals(4, phrase.getLength());
    assertEquals("", phrase.getData());
    assertEquals(root, phrase.getOwner());
    assertEquals(1, phrase.getStartIndex());

    // Expected replacement per phrase word; null means no replacement.
    final String[] replacements = { "this", null, null, "test" };
    PhraseMatcher.Phrase.MatchIterator items = phrase.itemIterator();
    for (int position = 0; position < phraseWords.length; position++) {
        assertEquals(new WordItem(phraseWords[position]), items.next());
        assertEquals(replacements[position], items.getReplace());
    }
    assertFalse(items.hasNext());
}
Also used : AndItem(com.yahoo.prelude.query.AndItem) WordItem(com.yahoo.prelude.query.WordItem) PhraseMatcher(com.yahoo.prelude.querytransform.PhraseMatcher) Test(org.junit.Test)

Aggregations

PhraseMatcher (com.yahoo.prelude.querytransform.PhraseMatcher)13 WordItem (com.yahoo.prelude.query.WordItem)9 Test (org.junit.Test)9 AndItem (com.yahoo.prelude.query.AndItem)5 Phrase (com.yahoo.prelude.querytransform.PhraseMatcher.Phrase)2 FSA (com.yahoo.fsa.FSA)1 IntItem (com.yahoo.prelude.query.IntItem)1 File (java.io.File)1