Search in sources :

Example 1 with Phrase

use of com.yahoo.prelude.querytransform.PhraseMatcher.Phrase in project vespa by vespa-engine.

the class RewriterFeatures method getNonOverlappingPartialPhraseMatches.

/**
 * <p>Collects the longest partial phrase matches found in the query by the
 * FSA dictionary, picked from left to right so that no two selected matches
 * overlap.</p>
 *
 * <p>Matches owned by an AND item are gathered per owner and reduced with
 * {@link #getNonOverlappingMatchesInAndItem}. Any other match is kept only
 * when it is a single word, and, if its owner is a RANK item, only when it is
 * the first item of that RANK item. This prevents e.g. (new RANK york) from
 * counting as a match when only "new york", but neither "new" nor "york",
 * is in the dictionary.</p>
 *
 * e.g. query: ((modern AND new AND york AND city AND travel) OR travel) AND
 *             ((sunny AND travel AND agency) OR nyc)<br>
 *      dictionary: <br>
 *                  mny\tmodern new york<br>
 *                  mo\tmodern<br>
 *                  modern\tn/a<br>
 *                  modern new york\tn/a<br>
 *                  new york\tn/a<br>
 *                  new york city\tn/a<br>
 *                  new york city travel\tn/a<br>
 *                  new york company\tn/a<br>
 *                  ny\tnew york<br>
 *                  nyc\tnew york city\tnew york company<br>
 *                  nyct\tnew york city travel<br>
 *                  ta\ttravel agency<br>
 *                  travel agency\tn/a<br>
 *      return: <br>
 *              modern<br>
 *              new york city travel<br>
 *              travel agency<br>
 *              nyc<br>
 * @param phraseMatcher PhraseMatcher object loaded with FSA dict
 * @param query Query object from the searcher
 * @return the selected matches, or null when phraseMatcher is null or the
 *         matcher reports no matches for the query
 * @throws RuntimeException if the matches of an AND item cannot be reduced
 */
public static Set<PhraseMatcher.Phrase> getNonOverlappingPartialPhraseMatches(PhraseMatcher phraseMatcher, Query query) throws RuntimeException {
    RewriterUtils.log(logger, query, "Retrieving longest non-overlapping partial phrase matches");
    if (phraseMatcher == null) {
        return null;
    }
    List<PhraseMatcher.Phrase> candidates = phraseMatcher.matchPhrases(query.getModel().getQueryTree().getRoot());
    if (candidates == null || candidates.isEmpty()) {
        return null;
    }

    Set<PhraseMatcher.Phrase> selected = new HashSet<>();
    // Matches collected so far for the AND item currently being walked
    List<PhraseMatcher.Phrase> pendingAndMatches = new ArrayList<>();
    CompositeItem lastOwner = null;

    for (PhraseMatcher.Phrase candidate : candidates) {
        RewriterUtils.log(logger, query, "Working on phrase: " + candidate);
        CompositeItem owner = candidate.getOwner();

        // Moving on to a different owner closes the pending AND group:
        // resolve its overlaps before collecting anything new
        boolean enteredOtherSubTree = !pendingAndMatches.isEmpty()
                && owner != null
                && lastOwner != null
                && !owner.equals(lastOwner);
        if (enteredOtherSubTree) {
            RewriterUtils.log(logger, query, "Previous phrase is in different AND item");
            List<PhraseMatcher.Phrase> reduced = getNonOverlappingMatchesInAndItem(pendingAndMatches, query);
            if (reduced == null) {
                RewriterUtils.error(logger, query, "Error retrieving matches from subtree");
                throw new RuntimeException("Error retrieving matches from subtree");
            }
            selected.addAll(reduced);
            pendingAndMatches.clear();
        }

        if (owner instanceof AndItem) {
            // Matches under an AND item are reduced together later
            pendingAndMatches.add(candidate);
        } else if (candidate.getLength() == 1
                && (!(owner instanceof RankItem) || candidate.getStartIndex() == 0)) {
            // Outside AND items only single words count; under a RANK item
            // only the word in first position
            selected.add(candidate);
        }
        lastOwner = owner;
    }

    // Resolve whatever AND group is still pending after the last match
    if (!pendingAndMatches.isEmpty()) {
        RewriterUtils.log(logger, query, "Last phrase is in AND item");
        List<PhraseMatcher.Phrase> reduced = getNonOverlappingMatchesInAndItem(pendingAndMatches, query);
        if (reduced == null) {
            RewriterUtils.error(logger, query, "Error retrieving matches from subtree");
            throw new RuntimeException("Error retrieving matches from subtree");
        }
        selected.addAll(reduced);
    }
    RewriterUtils.log(logger, query, "Successfully Retrieved longest non-overlapping partial phrase matches");
    return selected;
}
Also used : Phrase(com.yahoo.prelude.querytransform.PhraseMatcher.Phrase) Phrase(com.yahoo.prelude.querytransform.PhraseMatcher.Phrase) PhraseMatcher(com.yahoo.prelude.querytransform.PhraseMatcher)

Example 2 with Phrase

use of com.yahoo.prelude.querytransform.PhraseMatcher.Phrase in project vespa by vespa-engine.

the class RewriterFeatures method getNonOverlappingFullPhraseMatches.

/**
 * <p>Collects the full phrase matches found in the query by the FSA
 * dictionary.</p>
 *
 * <p>A match is kept when one of the following holds:</p>
 * <ul>
 *   <li>its owner is an AND item and the phrase reports itself as complete
 *       ({@code isComplete()})</li>
 *   <li>it is a single word whose owner is an OR item</li>
 *   <li>it is a single word in first position of a RANK item; this prevents
 *       e.g. (new RANK york) from counting as a match when only "new york",
 *       but neither "new" nor "york", is in the dictionary</li>
 *   <li>it is a single word without an owner</li>
 * </ul>
 *
 * e.g. query: ((modern AND new AND york AND city AND travel) OR travel) AND
 *             ((sunny AND travel AND agency) OR nyc)<br>
 *      dictionary: <br>
 *                  mny\tmodern new york<br>
 *                  mo\tmodern<br>
 *                  modern\tn/a<br>
 *                  modern new york\tn/a<br>
 *                  new york\tn/a<br>
 *                  new york city\tn/a<br>
 *                  new york city travel\tn/a<br>
 *                  new york company\tn/a<br>
 *                  ny\tnew york<br>
 *                  nyc\tnew york city\tnew york company<br>
 *                  nyct\tnew york city travel<br>
 *                  ta\ttravel agency<br>
 *                  travel agency\tn/a<br>
 *      return: nyc
 * @param phraseMatcher PhraseMatcher object loaded with FSA dict
 * @param query Query object from the searcher
 * @return the kept matches, or null when phraseMatcher is null or the
 *         matcher reports no matches for the query
 */
public static Set<PhraseMatcher.Phrase> getNonOverlappingFullPhraseMatches(PhraseMatcher phraseMatcher, Query query) throws RuntimeException {
    RewriterUtils.log(logger, query, "Retrieving longest non-overlapping full phrase matches");
    if (phraseMatcher == null) {
        return null;
    }
    List<PhraseMatcher.Phrase> candidates = phraseMatcher.matchPhrases(query.getModel().getQueryTree().getRoot());
    if (candidates == null || candidates.isEmpty()) {
        return null;
    }

    Set<PhraseMatcher.Phrase> kept = new HashSet<>();
    for (PhraseMatcher.Phrase candidate : candidates) {
        RewriterUtils.log(logger, query, "Working on phrase: " + candidate);
        CompositeItem owner = candidate.getOwner();

        boolean keep;
        if (owner == null) {
            // No owner: only a single word qualifies
            keep = candidate.getLength() == 1;
        } else {
            // Complete phrase under AND, or a single word under OR, or a
            // single word in first position under RANK
            keep = (candidate.isComplete() && owner instanceof AndItem)
                    || (candidate.getLength() == 1
                        && (owner instanceof OrItem
                            || (owner instanceof RankItem && candidate.getStartIndex() == 0)));
        }

        if (keep) {
            kept.add(candidate);
            RewriterUtils.log(logger, query, "Keeping phrase: " + candidate);
        }
    }
    RewriterUtils.log(logger, query, "Successfully Retrieved longest non-overlapping full phrase matches");
    return kept;
}
Also used : Phrase(com.yahoo.prelude.querytransform.PhraseMatcher.Phrase) Phrase(com.yahoo.prelude.querytransform.PhraseMatcher.Phrase) PhraseMatcher(com.yahoo.prelude.querytransform.PhraseMatcher)

Example 3 with Phrase

use of com.yahoo.prelude.querytransform.PhraseMatcher.Phrase in project vespa by vespa-engine.

the class RewriterFeatures method getNonOverlappingMatchesInAndItem.

/**
 * <p>Reduces the matches found inside one AND item to the longest ones that
 * do not overlap, preferring matches further to the left when lengths are
 * equal (ordering is delegated to {@code PhraseLength}).</p>
 *
 * <p>The given list is sorted and pruned in place, and the same list instance
 * is returned. Matches whose first item is a filter item are dropped. A list
 * holding exactly one match is returned untouched.</p>
 *
 * e.g. subtree: (modern AND new AND york AND city AND travel)<br>
 *      dictionary:<br>
 *                  mny\tmodern new york<br>
 *                  mo\tmodern<br>
 *                  modern\tn/a<br>
 *                  modern new york\tn/a<br>
 *                  new york\tn/a<br>
 *                  new york city\tn/a<br>
 *                  new york city travel\tn/a<br>
 *                  new york company\tn/a<br>
 *                  ny\tnew york<br>
 *                  nyc\tnew york city\tnew york company<br>
 *                  nyct\tnew york city travel<br>
 *      allMatches:<br>
 *                  modern<br>
 *                  modern new york<br>
 *                  new york<br>
 *                  new york city<br>
 *                  new york city travel<br>
 *      return: <br>
 *              modern<br>
 *              new york city travel<br>
 * @param allMatches All matches within the subtree; modified by this call
 * @param query Query object from the searcher
 * @return the pruned allMatches list, or null when allMatches is null or empty
 */
public static List<PhraseMatcher.Phrase> getNonOverlappingMatchesInAndItem(List<PhraseMatcher.Phrase> allMatches, Query query) throws RuntimeException {
    RewriterUtils.log(logger, query, "Retrieving longest non-overlapping matches in subtree");
    if (allMatches == null || allMatches.isEmpty()) {
        return null;
    }
    if (allMatches.size() == 1) {
        RewriterUtils.log(logger, query, "Only one match in subtree");
        return allMatches;
    }

    // Order the candidates so that the preferred ones are visited first
    RewriterUtils.log(logger, query, "Sorting the phrases");
    Collections.sort(allMatches, new PhraseLength());

    // One bit per item of the owning subtree; a set bit marks a position
    // already claimed by a match that was kept
    int subTreeSize = allMatches.get(0).getOwner().getItemCount();
    BitSet claimed = new BitSet(subTreeSize);

    RewriterUtils.log(logger, query, "Removing matches that are overlapping with previously selected ones");
    for (Iterator<PhraseMatcher.Phrase> candidates = allMatches.iterator(); candidates.hasNext(); ) {
        PhraseMatcher.Phrase candidate = candidates.next();

        // A match that starts with a filter item is never kept
        PhraseMatcher.Phrase.MatchIterator words = candidate.itemIterator();
        if (words.hasNext() && words.next().isFilter()) {
            RewriterUtils.log(logger, query, "Removing filter item" + candidate);
            candidates.remove();
            continue;
        }

        int from = candidate.getStartIndex();
        BitSet span = new BitSet(subTreeSize);
        span.set(from, candidate.getLength() + from);

        if (!span.intersects(claimed)) {
            RewriterUtils.log(logger, query, "Keeping " + candidate);
            claimed.or(span);
        } else {
            RewriterUtils.log(logger, query, "Removing " + candidate);
            candidates.remove();
        }
    }
    return allMatches;
}
Also used : Phrase(com.yahoo.prelude.querytransform.PhraseMatcher.Phrase) Phrase(com.yahoo.prelude.querytransform.PhraseMatcher.Phrase) PhraseMatcher(com.yahoo.prelude.querytransform.PhraseMatcher)

Example 4 with Phrase

use of com.yahoo.prelude.querytransform.PhraseMatcher.Phrase in project vespa by vespa-engine.

the class RewriterFeatures method addExpansions.

/**
 * <p>Add Expansions to the matching phrases</p>
 *
 * e.g. Query: nyc travel agency<br>
 *      matching phrase: nyc\tnew york city\tnew york company<br>
 *                       travel agency\tn/a<br>
 *      if expandIndex is not null and removeOriginal is true<br>
 *      New Query: ((new york city) OR ([expandIndex]:new york city)
 *                  OR (new york company) OR
 *                  ([expandIndex]:new york company)) AND
 *                 ((travel agency) OR ([expandIndex]:travel agency))<br>
 *      if expandIndex is null and removeOriginal is true<br>
 *      New Query: ((new york city) OR (new york company)) AND
 *                 travel agency<br>
 *      if expandIndex is null and removeOriginal is false<br>
 *      New Query: (nyc OR (new york city) OR (new york company)) AND
 *                 travel agency<br>
 *
 * <p>Every processed match is replaced, inside its owner, by one OR group
 * holding its expansions. Matches are processed from the highest start index
 * to the lowest so that replacing one match never moves the items of a match
 * that is still to be processed. The query type is set to ADVANCED once the
 * matches have been processed.</p>
 *
 * @param query Query object from searcher
 * @param matches Set of longest non-overlapping matches; when null the
 *                query is returned untouched
 * @param expandIndex Name of expansion index or null if
 *                    default index
 * @param maxNumRewrites Max number of rewrites to be added,
 *                       0 if no limit
 * @param removeOriginal Whether to remove the original matching phrase
 * @param addUnitToRewrites Whether to add rewrite as phrase
 * @return the query passed in, rewritten in place
 * @throws RuntimeException if canonicalizing the rewritten query tree
 *                          reports an error
 */
public static Query addExpansions(Query query, Set<PhraseMatcher.Phrase> matches, String expandIndex, int maxNumRewrites, boolean removeOriginal, boolean addUnitToRewrites) throws RuntimeException {
    if (matches == null) {
        RewriterUtils.log(logger, query, "No expansions to be added");
        return query;
    }
    RewriterUtils.log(logger, query, "Adding expansions to matching phrases");
    Model queryModel = query.getModel();
    QueryTree qTree = queryModel.getQueryTree();

    // A match is spliced into its owner by the start index it reports.
    // Replacing a multi-word match with a single OR group shifts every later
    // item of the same owner to the left, and a Set gives no iteration order,
    // so a later match of that owner could be spliced at a stale position.
    // Working from the highest start index downwards only ever shifts items
    // that have already been rewritten.
    // NOTE(review): assumes getStartIndex() is the position recorded at match
    // time and is not updated when the owner is edited - confirm.
    List<PhraseMatcher.Phrase> orderedMatches = new ArrayList<>(matches);
    Collections.sort(orderedMatches, (left, right) -> Integer.compare(right.getStartIndex(), left.getStartIndex()));

    CompositeItem parent = null;
    // Iterate through all matches
    for (PhraseMatcher.Phrase match : orderedMatches) {
        RewriterUtils.log(logger, query, "Working on phrase: " + match);
        // Retrieve expansion phrases
        String expansionStr = match.getData();
        // A match without expansions only needs rewriting when it must also
        // be searched in the expansion index
        if (expansionStr.equalsIgnoreCase("n/a") && expandIndex == null) {
            continue;
        }
        StringTokenizer expansions = new StringTokenizer(expansionStr, "\t");
        // Create this structure for all expansions of this match
        // (OR (AND expansion1) indexName:expansion1
        // (AND expansion2) indexName:expansion2..)
        OrItem expansionGrp = new OrItem();
        int numRewrites = 0;
        String matchStr = convertMatchToString(match);
        while (expansions.hasMoreTokens() && (maxNumRewrites == 0 || numRewrites < maxNumRewrites)) {
            String expansion = expansions.nextToken();
            RewriterUtils.log(logger, query, "Working on expansion: " + expansion);
            // "n/a" stands for "no expansion": fall back to the matched text
            if (expansion.equalsIgnoreCase("n/a")) {
                expansion = matchStr;
            }
            // (AND expansion) or "expansion"
            Item expansionItem = convertStringToQTree(query, expansion);
            if (addUnitToRewrites && expansionItem instanceof AndItem) {
                expansionItem = convertAndToPhrase((AndItem) expansionItem);
            }
            expansionGrp.addItem(expansionItem);
            if (expandIndex != null) {
                // indexName:expansion
                WordItem expansionIndexItem = new WordItem(expansion, expandIndex);
                expansionGrp.addItem(expansionIndexItem);
            }
            numRewrites++;
            RewriterUtils.log(logger, query, "Adding expansion: " + expansion);
        }
        if (!removeOriginal) {
            // (AND original), unless an identical item is already in the group
            Item matchItem = convertStringToQTree(query, matchStr);
            if (expansionGrp.getItemIndex(matchItem) == -1) {
                expansionGrp.addItem(matchItem);
            }
        }
        parent = match.getOwner();
        int matchIndex = match.getStartIndex();
        if (parent != null) {
            // Remove matching phrase from original query; the following
            // items slide down, so the same index is removed each time
            for (int i = 0; i < match.getLength(); i++) {
                parent.removeItem(matchIndex);
            }
            // Adding back expansions
            parent.addItem(matchIndex, expansionGrp);
        } else {
            RewriterUtils.log(logger, query, "Single root item");
            // If there's no parent, i.e. single root item
            qTree.setRoot(expansionGrp);
            break;
        }
    }
    // Not root single item
    if (parent != null) {
        // Cleaning up the query after rewrite to remove redundant tags
        // e.g. (AND (OR (AND a b) c)) => (OR (AND a b) c)
        String cleanupError = QueryCanonicalizer.canonicalize(qTree);
        if (cleanupError != null) {
            RewriterUtils.error(logger, query, "Error canonicalizing query tree");
            throw new RuntimeException("Error canonicalizing query tree");
        }
    }
    // set type=adv
    queryModel.setType(Query.Type.ADVANCED);
    RewriterUtils.log(logger, query, "Successfully added expansions to matching phrases");
    return query;
}
Also used : Phrase(com.yahoo.prelude.querytransform.PhraseMatcher.Phrase) Phrase(com.yahoo.prelude.querytransform.PhraseMatcher.Phrase) PhraseMatcher(com.yahoo.prelude.querytransform.PhraseMatcher)

Aggregations

PhraseMatcher (com.yahoo.prelude.querytransform.PhraseMatcher)4 Phrase (com.yahoo.prelude.querytransform.PhraseMatcher.Phrase)4