Search in sources:

Example 1 with NodePath

Use of org.exist.storage.NodePath in project exist by eXist-db.

From the class DOMIndexer, method store().

/**
 * Store the nodes.
 */
public void store() {
    // Create a wrapper element as root node
    final ElementImpl elem = new ElementImpl(ROOT_QNAME, broker.getBrokerPool().getSymbols());
    elem.setNodeId(broker.getBrokerPool().getNodeFactory().createInstance());
    elem.setOwnerDocument(targetDoc);
    elem.setChildCount(doc.getChildCount());
    elem.addNamespaceMapping(Namespaces.EXIST_NS_PREFIX, Namespaces.EXIST_NS);
    final NodePath path = new NodePath();
    path.addComponent(ROOT_QNAME);
    stack.push(elem);
    broker.storeNode(transaction, elem, path, indexSpec);
    targetDoc.appendChild((NodeHandle) elem);
    elem.setChildCount(0);
    // store the document nodes
    int top = (doc.size > 1) ? 1 : -1;
    while (top > 0) {
        store(top, path);
        top = doc.getNextSiblingFor(top);
    }
    // Close the wrapper element
    stack.pop();
    broker.endElement(elem, path, null);
    path.removeLastComponent();
}
Also used: ElementImpl(org.exist.dom.persistent.ElementImpl) NodePath(org.exist.storage.NodePath)
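
The NodePath passed to storeNode and endElement mirrors the element stack: one component is pushed per opened element and popped again once the element is closed. A minimal standalone sketch of that life-cycle; the single-argument QName constructor and a readable NodePath.toString() are assumptions here, not taken from this listing:

import org.exist.dom.QName;
import org.exist.storage.NodePath;

public class NodePathLifecycleDemo {
    public static void main(String[] args) {
        final NodePath path = new NodePath();
        // entering elements: push one component per open tag
        // (assumption: QName(String) constructs a name in no namespace)
        path.addComponent(new QName("root"));
        path.addComponent(new QName("child"));
        // path now denotes /root/child (assuming toString renders the components)
        System.out.println(path);
        // leaving elements: pop in reverse order, as store()/endElement do above
        path.removeLastComponent();
        path.removeLastComponent();
    }
}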

Example 2 with NodePath

Use of org.exist.storage.NodePath in project exist by eXist-db.

From the class IndexUtils, method scanNode().

public static void scanNode(DBBroker broker, Txn transaction, IStoredNode node, StreamListener listener) {
    try (final INodeIterator iterator = broker.getNodeIterator(node)) {
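        // the first next() returns the start node itself; the scan then
        // continues with its descendants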
        iterator.next();
        final NodePath path = node.getPath();
        scanNode(transaction, iterator, node, listener, path);
    } catch (final IOException ioe) {
        LOG.warn("Unable to close iterator", ioe);
    }
}
Also used: INodeIterator(org.exist.storage.dom.INodeIterator) IOException(java.io.IOException) NodePath(org.exist.storage.NodePath)
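
The private scanNode overload that does the actual traversal is not shown in this listing. Conceptually it streams the subtree below the start node to the listener while keeping the path in step with the traversal. The following plain-Java analogue (no eXist types, purely illustrative) shows that bookkeeping:

import java.util.ArrayDeque;
import java.util.Deque;

public class ScanDemo {
    public static void main(String[] args) {
        // a pre-order event stream standing in for the stored-node iterator
        final String[] events = {"open root", "open a", "text hello", "close a", "close root"};
        final Deque<String> path = new ArrayDeque<>();
        for (final String ev : events) {
            final String[] parts = ev.split(" ", 2);
            switch (parts[0]) {
                case "open":
                    path.addLast(parts[1]);                  // push a path component
                    System.out.println("startElement /" + String.join("/", path));
                    break;
                case "text":
                    System.out.println("characters at /" + String.join("/", path) + ": " + parts[1]);
                    break;
                case "close":
                    System.out.println("endElement /" + String.join("/", path));
                    path.removeLast();                       // pop on close
                    break;
            }
        }
    }
}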

Example 3 with NodePath

Use of org.exist.storage.NodePath in project exist by eXist-db.

From the class LuceneMatchListener, method scanMatches().

private void scanMatches(final NodeProxy p) {
    // Collect the text content of all descendants of p.
    // Remember the start offsets of the text nodes for later use.
    final NodePath path = getPath(p);
    final LuceneIndexConfig idxConf = config.getConfig(path).next();
    final TextExtractor extractor = new DefaultTextExtractor();
    extractor.configure(config, idxConf);
    final OffsetList offsets = new OffsetList();
    int level = 0;
    int textOffset = 0;
    try {
        final IEmbeddedXMLStreamReader reader = broker.getXMLStreamReader(p, false);
        while (reader.hasNext()) {
            final int ev = reader.next();
            switch(ev) {
                case XMLStreamConstants.END_ELEMENT:
                    if (--level < 0) {
                        break;
                    }
                    // call extractor.endElement unless this is the root of the current fragment
                    if (level > 0) {
                        textOffset += extractor.endElement(reader.getQName());
                    }
                    break;
                case XMLStreamConstants.START_ELEMENT:
                    // call extractor.startElement unless this is the root of the current fragment
                    if (level > 0) {
                        textOffset += extractor.startElement(reader.getQName());
                    }
                    ++level;
                    break;
                case XMLStreamConstants.CHARACTERS:
                    final NodeId nodeId = (NodeId) reader.getProperty(ExtendedXMLStreamReader.PROPERTY_NODE_ID);
                    textOffset += extractor.beforeCharacters();
                    offsets.add(textOffset, nodeId);
                    textOffset += extractor.characters(reader.getXMLText());
                    break;
            }
        }
    } catch (final IOException | XMLStreamException e) {
        LOG.warn("Problem found while serializing XML: {}", e.getMessage(), e);
    }
    // Retrieve the Analyzer for the NodeProxy that was used for
    // indexing and querying.
    Analyzer analyzer = idxConf.getAnalyzer();
    if (analyzer == null) {
        // Otherwise use system default Lucene analyzer (from conf.xml)
        // to tokenize the text and find matching query terms.
        analyzer = index.getDefaultAnalyzer();
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Analyzer: {} for path: {}", analyzer, path);
    }
    final String str = extractor.getText().toString();
    try (final Reader reader = new StringReader(str);
        final TokenStream tokenStream = analyzer.tokenStream(null, reader)) {
        tokenStream.reset();
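        // MarkableTokenFilter is an eXist helper that adds mark() support on top
        // of the Lucene token stream, letting the loop below scan ahead for the
        // remaining terms of a phrase and, via captureState/restoreState, revisit
        // the tokens it consumed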
        final MarkableTokenFilter stream = new MarkableTokenFilter(tokenStream);
        while (stream.incrementToken()) {
            String text = stream.getAttribute(CharTermAttribute.class).toString();
            final Query query = termMap.get(text);
            if (query != null) {
                // single words which may also occur elsewhere in the document
                if (query instanceof PhraseQuery) {
                    final PhraseQuery phraseQuery = (PhraseQuery) query;
                    final Term[] terms = phraseQuery.getTerms();
                    if (text.equals(terms[0].text())) {
                        // Scan the following text and collect tokens to see
                        // if they are part of the phrase.
                        stream.mark();
                        int t = 1;
                        final List<State> stateList = new ArrayList<>(terms.length);
                        stateList.add(stream.captureState());
                        while (stream.incrementToken() && t < terms.length) {
                            text = stream.getAttribute(CharTermAttribute.class).toString();
                            if (text.equals(terms[t].text())) {
                                stateList.add(stream.captureState());
                                if (++t == terms.length) {
                                    break;
                                }
                            } else {
                                // stream.reset();
                                break;
                            }
                        }
                        if (stateList.size() == terms.length) {
                            // we indeed have a phrase match. record the offsets of its terms.
                            int lastIdx = -1;
                            for (int i = 0; i < terms.length; i++) {
                                stream.restoreState(stateList.get(i));
                                final OffsetAttribute offsetAttr = stream.getAttribute(OffsetAttribute.class);
                                final int idx = offsets.getIndex(offsetAttr.startOffset());
                                final NodeId nodeId = offsets.ids[idx];
                                final Offset offset = nodesWithMatch.get(nodeId);
                                if (offset != null) {
                                    if (lastIdx == idx) {
                                        offset.setEndOffset(offsetAttr.endOffset() - offsets.offsets[idx]);
                                    } else {
                                        offset.add(offsetAttr.startOffset() - offsets.offsets[idx], offsetAttr.endOffset() - offsets.offsets[idx]);
                                    }
                                } else {
                                    nodesWithMatch.put(nodeId, new Offset(offsetAttr.startOffset() - offsets.offsets[idx], offsetAttr.endOffset() - offsets.offsets[idx]));
                                }
                                lastIdx = idx;
                            }
                        }
                    }
                // End of phrase handling
                } else {
                    final OffsetAttribute offsetAttr = stream.getAttribute(OffsetAttribute.class);
                    final int idx = offsets.getIndex(offsetAttr.startOffset());
                    final NodeId nodeId = offsets.ids[idx];
                    final Offset offset = nodesWithMatch.get(nodeId);
                    if (offset != null) {
                        offset.add(offsetAttr.startOffset() - offsets.offsets[idx], offsetAttr.endOffset() - offsets.offsets[idx]);
                    } else {
                        nodesWithMatch.put(nodeId, new Offset(offsetAttr.startOffset() - offsets.offsets[idx], offsetAttr.endOffset() - offsets.offsets[idx]));
                    }
                }
            }
        }
    } catch (final IOException e) {
        LOG.warn("Problem found while serializing XML: {}", e.getMessage(), e);
    }
}
Also used: TokenStream(org.apache.lucene.analysis.TokenStream) Query(org.apache.lucene.search.Query) PhraseQuery(org.apache.lucene.search.PhraseQuery) IEmbeddedXMLStreamReader(org.exist.stax.IEmbeddedXMLStreamReader) ExtendedXMLStreamReader(org.exist.stax.ExtendedXMLStreamReader) Reader(java.io.Reader) StringReader(java.io.StringReader) Analyzer(org.apache.lucene.analysis.Analyzer) IOException(java.io.IOException) Term(org.apache.lucene.index.Term) NodePath(org.exist.storage.NodePath) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) XMLStreamException(javax.xml.stream.XMLStreamException) State(org.apache.lucene.util.AttributeSource.State) NodeId(org.exist.numbering.NodeId) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute)
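
The OffsetList above records, for each text node, its start offset within the concatenated text, so a Lucene token's offsets can be mapped back to a node ID and rebased to node-relative coordinates (hence the repeated offsetAttr.startOffset() - offsets.offsets[idx]). A minimal standalone analogue of that floor lookup, illustrative only and not eXist's actual OffsetList:

public class OffsetLookupDemo {
    // returns the index of the last recorded start offset <= offset,
    // i.e. the text node the character position falls into
    static int getIndex(final int[] starts, final int offset) {
        int idx = 0;
        for (int i = 0; i < starts.length; i++) {
            if (starts[i] <= offset) {
                idx = i;
            } else {
                break;
            }
        }
        return idx;
    }

    public static void main(String[] args) {
        final int[] starts = {0, 5, 12};   // three text nodes start at these offsets
        final int tokenStart = 7;          // a match beginning at global offset 7
        final int idx = getIndex(starts, tokenStart);
        // falls into the second node; node-relative offset is 7 - 5 = 2
        System.out.println("node " + idx + ", relative offset " + (tokenStart - starts[idx]));
    }
}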

Example 4 with NodePath

Use of org.exist.storage.NodePath in project exist by eXist-db.

From the class Field, method highlightMatches().

/**
 * Highlight matches in field content using the analyzer defined for the field.
 *
 * @param fieldName the name of the field
 * @param proxy node on which the field is defined
 * @param match the lucene match attached to the node
 * @param text the content of the field
 * @return a sequence of exist:field elements containing the field content with matches enclosed in exist:match
 * @throws XPathException in case of error
 * @throws IOException in case of a lucene error
 */
private Sequence highlightMatches(final String fieldName, final NodeProxy proxy, final LuceneMatch match, final Sequence text) throws XPathException, IOException {
    final LuceneIndexWorker index = (LuceneIndexWorker) context.getBroker().getIndexController().getWorkerByIndexId(LuceneIndex.ID);
    final Map<Object, Query> terms = index.getTerms(match.getQuery());
    final NodePath path = LuceneMatchListener.getPath(proxy);
    final LuceneConfig config = index.getLuceneConfig(context.getBroker(), proxy.getDocumentSet());
    LuceneIndexConfig idxConf = config.getConfig(path).next();
    if (idxConf == null) {
        // no lucene index: no fields to highlight
        return Sequence.EMPTY_SEQUENCE;
    }
    final Analyzer analyzer = idxConf.getAnalyzer();
    context.pushDocumentContext();
    try {
        final MemTreeBuilder builder = context.getDocumentBuilder();
        builder.startDocument();
        final InMemoryNodeSet result = new InMemoryNodeSet(text.getItemCount());
        for (final SequenceIterator si = text.iterate(); si.hasNext(); ) {
            final int nodeNr = builder.startElement(Namespaces.EXIST_NS, "field", "exist:field", null);
            final String content = si.nextItem().getStringValue();
            int currentPos = 0;
            try (final Reader reader = new StringReader(content);
                final TokenStream tokenStream = analyzer.tokenStream(fieldName, reader)) {
                tokenStream.reset();
                final MarkableTokenFilter stream = new MarkableTokenFilter(tokenStream);
                while (stream.incrementToken()) {
                    String token = stream.getAttribute(CharTermAttribute.class).toString();
                    final Query query = terms.get(token);
                    if (query != null) {
                        if (match.getQuery() instanceof PhraseQuery) {
                            final Term[] phraseTerms = ((PhraseQuery) match.getQuery()).getTerms();
                            if (token.equals(phraseTerms[0].text())) {
                                // Scan the following text and collect tokens to see
                                // if they are part of the phrase.
                                stream.mark();
                                int t = 1;
                                OffsetAttribute offset = stream.getAttribute(OffsetAttribute.class);
                                final int startOffset = offset.startOffset();
                                int endOffset = offset.endOffset();
                                while (stream.incrementToken() && t < phraseTerms.length) {
                                    token = stream.getAttribute(CharTermAttribute.class).toString();
                                    if (token.equals(phraseTerms[t].text())) {
                                        offset = stream.getAttribute(OffsetAttribute.class);
                                        endOffset = offset.endOffset();
                                        t++;
                                        if (t == phraseTerms.length) {
                                            break;
                                        }
                                    } else {
                                        break;
                                    }
                                }
                                if (t == phraseTerms.length) {
                                    if (currentPos < startOffset) {
                                        builder.characters(content.substring(currentPos, startOffset));
                                    }
                                    builder.startElement(Namespaces.EXIST_NS, "match", "exist:match", null);
                                    builder.characters(content.substring(startOffset, endOffset));
                                    builder.endElement();
                                    currentPos = endOffset;
                                }
                            }
                        // End of phrase handling
                        } else {
                            final OffsetAttribute offset = stream.getAttribute(OffsetAttribute.class);
                            if (currentPos < offset.startOffset()) {
                                builder.characters(content.substring(currentPos, offset.startOffset()));
                            }
                            builder.startElement(Namespaces.EXIST_NS, "match", "exist:match", null);
                            builder.characters(content.substring(offset.startOffset(), offset.endOffset()));
                            builder.endElement();
                            currentPos = offset.endOffset();
                        }
                    }
                }
            }
            if (currentPos < content.length()) {
                builder.characters(content.substring(currentPos));
            }
            builder.endElement();
            result.add(builder.getDocument().getNode(nodeNr));
        }
        return result;
    } finally {
        context.popDocumentContext();
    }
}
Also used: TokenStream(org.apache.lucene.analysis.TokenStream) Query(org.apache.lucene.search.Query) PhraseQuery(org.apache.lucene.search.PhraseQuery) InMemoryNodeSet(org.exist.dom.memtree.InMemoryNodeSet) Reader(java.io.Reader) StringReader(java.io.StringReader) Term(org.apache.lucene.index.Term) Analyzer(org.apache.lucene.analysis.Analyzer) NodePath(org.exist.storage.NodePath) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) MemTreeBuilder(org.exist.dom.memtree.MemTreeBuilder) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute)
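
Stripped of the memtree builder and Lucene plumbing, the highlighting loop reduces to: copy the text untouched up to each match, wrap the matched span in exist:match, and append whatever remains after the last match. A self-contained sketch of that shape, using plain strings instead of builder events:

public class HighlightDemo {
    // spans: sorted, non-overlapping {start, end} pairs in content coordinates
    static String highlight(final String content, final int[][] spans) {
        final StringBuilder out = new StringBuilder();
        int currentPos = 0;
        for (final int[] span : spans) {
            if (currentPos < span[0]) {
                out.append(content, currentPos, span[0]);    // text before the match
            }
            out.append("<exist:match>")
               .append(content, span[0], span[1])            // the match itself
               .append("</exist:match>");
            currentPos = span[1];
        }
        if (currentPos < content.length()) {
            out.append(content.substring(currentPos));       // trailing text
        }
        return out.toString();
    }

    public static void main(String[] args) {
        System.out.println(highlight("hello brave world", new int[][] {{6, 11}}));
        // -> hello <exist:match>brave</exist:match> world
    }
}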

Example 5 with NodePath

Use of org.exist.storage.NodePath in project exist by eXist-db.

From the class OptimizeFieldPragma, method tryRewriteToFields().

private Expression tryRewriteToFields(LocationStep locationStep, final Predicate[] preds, NodePath contextPath, Sequence contextSequence) throws XPathException {
    // without context path, we cannot rewrite the entire query
    if (contextPath != null) {
        final List<Predicate> notOptimizable = new ArrayList<>(preds.length);
        List<RangeIndexConfig> configs = getConfigurations(contextSequence);
        // walk through the predicates attached to the current location step
        // check if expression can be optimized
        final Map<Predicate, List<Expression>> predicateArgs = new IdentityHashMap<>(preds.length);
        for (final Predicate pred : preds) {
            List<Expression> args = null;
            SequenceConstructor arg0 = null;
            SequenceConstructor arg1 = null;
            if (pred.getLength() != 1) {
                // can only optimize predicates with one expression
                notOptimizable.add(pred);
                continue;
            }
            Expression innerExpr = pred.getExpression(0);
            List<LocationStep> steps = RangeQueryRewriter.getStepsToOptimize(innerExpr);
            if (steps == null) {
                notOptimizable.add(pred);
                continue;
            }
            // compute left hand path
            NodePath innerPath = RangeQueryRewriter.toNodePath(steps);
            if (innerPath == null) {
                notOptimizable.add(pred);
                continue;
            }
            NodePath path = new NodePath(contextPath);
            path.append(innerPath);
            if (path.length() > 0) {
                // find all complex range index configurations matching the full path to the predicate expression
                final List<ComplexRangeIndexConfigElement> rices = findConfigurations(path, configs);
                // config with most conditions for path comes first
                rices.sort(ComplexRangeIndexConfigElement.NUM_CONDITIONS_COMPARATOR);
                if (rices.isEmpty()) {
                    notOptimizable.add(pred);
                    continue;
                }
                // found index configuration with sub-fields
                int predIdx = -1;
                for (int i = 0; i < preds.length; i++) {
                    if (preds[i] == pred) {
                        predIdx = i;
                        break;
                    }
                }
                final Predicate[] precedingPreds = Arrays.copyOf(preds, predIdx);
                final ArrayList<Predicate> matchedPreds = new ArrayList<>();
                ComplexRangeIndexConfigElement rice = null;
                for (ComplexRangeIndexConfigElement testRice : rices) {
                    if (testRice.getNumberOfConditions() > 0) {
                        // find a config element where the conditions match preceding predicates
                        matchedPreds.clear();
                        for (Predicate precedingPred : precedingPreds) {
                            if (testRice.findCondition(precedingPred)) {
                                matchedPreds.add(precedingPred);
                            }
                        }
                        if (matchedPreds.size() == testRice.getNumberOfConditions()) {
                            // all conditions matched
                            rice = testRice;
                            // if any preceding predicates found to be part of a condition for this config
                            // had been matched to another config before, remove them, as this is the correct match
                            predicateArgs.keySet().removeAll(matchedPreds);
                            // also do not re-add them after optimizing
                            notOptimizable.removeAll(matchedPreds);
                            break;
                        }
                    } else {
                        // no conditional configs for this node path, take the first one found if any
                        rice = testRice;
                    }
                }
                if (rice != null && rice.getNodePath().match(contextPath)) {
                    // check for a matching sub-path and retrieve field information
                    RangeIndexConfigField field = rice.getField(path);
                    if (field != null) {
                        if (args == null) {
                            // initialize args
                            args = new ArrayList<>(4);
                            arg0 = new SequenceConstructor(context);
                            args.add(arg0);
                            arg1 = new SequenceConstructor(context);
                            args.add(arg1);
                        }
                        // field is added to the sequence in first parameter
                        arg0.add(new LiteralValue(context, new StringValue(field.getName())));
                        // operator
                        arg1.add(new LiteralValue(context, new StringValue(RangeQueryRewriter.getOperator(innerExpr).toString())));
                        // append right hand expression as additional parameter
                        args.add(getKeyArg(innerExpr));
                        // store the collected arguments with a reference to the predicate
                        // so they can be removed if a better match is found (i.e. if the predicate
                        // happens to be one of the conditions for a following predicate)
                        predicateArgs.put(pred, args);
                    } else {
                        notOptimizable.add(pred);
                        continue;
                    }
                } else {
                    notOptimizable.add(pred);
                    continue;
                }
            } else {
                notOptimizable.add(pred);
                continue;
            }
        }
        if (!predicateArgs.isEmpty()) {
            // the entire filter expression can be replaced
            // create range:field-equals function
            FieldLookup func = new FieldLookup(context, FieldLookup.signatures[0]);
            func.setFallback(locationStep);
            func.setLocation(locationStep.getLine(), locationStep.getColumn());
            if (predicateArgs.size() == 1) {
                func.setArguments(predicateArgs.entrySet().iterator().next().getValue());
            } else {
                final List<Expression> mergedArgs = new ArrayList<>(predicateArgs.size() * 4);
                final SequenceConstructor arg0 = new SequenceConstructor(context);
                mergedArgs.add(arg0);
                final SequenceConstructor arg1 = new SequenceConstructor(context);
                mergedArgs.add(arg1);
                for (final List<Expression> args : predicateArgs.values()) {
                    arg0.add(args.get(0));
                    arg1.add(args.get(1));
                    mergedArgs.addAll(args.subList(2, args.size()));
                }
                func.setArguments(mergedArgs);
            }
            Expression optimizedExpr = new InternalFunctionCall(func);
            if (!notOptimizable.isEmpty()) {
                final FilteredExpression filtered = new FilteredExpression(context, optimizedExpr);
                for (Predicate pred : notOptimizable) {
                    filtered.addPredicate(pred);
                }
                optimizedExpr = filtered;
            }
            return optimizedExpr;
        }
    }
    return null;
}
Also used: NodePath(org.exist.storage.NodePath)
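
The selection loop above prefers the conditional index configuration whose conditions are all satisfied by the predicates preceding the current one, and falls back to an unconditional configuration otherwise. A simplified self-contained rendering of that choice; the condition strings are invented for illustration:

import java.util.List;
import java.util.Set;

public class ConfigPickDemo {
    public static void main(String[] args) {
        // candidate configs, sorted by number of conditions, most first
        final List<List<String>> candidates = List.of(
                List.of("@type = 'x'", "@lang = 'en'"),
                List.of("@type = 'x'"),
                List.of());                          // unconditional fallback
        final Set<String> precedingPreds = Set.of("@type = 'x'");

        List<String> picked = null;
        for (final List<String> conditions : candidates) {
            if (conditions.isEmpty()) {
                // no conditions: remember as fallback, keep scanning
                if (picked == null) {
                    picked = conditions;
                }
            } else if (precedingPreds.containsAll(conditions)) {
                // every condition matched a preceding predicate: best match, stop
                picked = conditions;
                break;
            }
        }
        System.out.println("picked config with conditions " + picked);
    }
}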

Aggregations

NodePath (org.exist.storage.NodePath): 14
IOException (java.io.IOException): 4
Reader (java.io.Reader): 2
StringReader (java.io.StringReader): 2
Nullable (javax.annotation.Nullable): 2
XMLStreamException (javax.xml.stream.XMLStreamException): 2
Analyzer (org.apache.lucene.analysis.Analyzer): 2
TokenStream (org.apache.lucene.analysis.TokenStream): 2
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 2
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 2
Term (org.apache.lucene.index.Term): 2
PhraseQuery (org.apache.lucene.search.PhraseQuery): 2
Query (org.apache.lucene.search.Query): 2
MemTreeBuilder (org.exist.dom.memtree.MemTreeBuilder): 2
ElementImpl (org.exist.dom.persistent.ElementImpl): 2
NodeId (org.exist.numbering.NodeId): 2
ExtendedXMLStreamReader (org.exist.stax.ExtendedXMLStreamReader): 2
ArrayDeque (java.util.ArrayDeque): 1
State (org.apache.lucene.util.AttributeSource.State): 1
Collection (org.exist.collections.Collection): 1