Search in sources :

Example 1 with OccurrenceList

Use of org.exist.storage.OccurrenceList in the eXist-db project (exist).

From the class NGramIndexWorker, method indexText:

/**
 * Records one occurrence entry per n-gram of {@code text} into the in-memory
 * {@code ngrams} map, keyed by (qname, n-gram term).
 * <p>
 * Iterates the string by Unicode code point (not by char), so supplementary
 * characters advance the offset by two UTF-16 units; {@code i} is the UTF-16
 * offset stored with each occurrence, {@code j} indexes the token array.
 *
 * @param nodeId the node the text belongs to; stored with every occurrence
 * @param qname  the element/attribute name used in the index key
 * @param text   the text content to index
 */
private void indexText(final NodeId nodeId, final QName qname, final String text) {
    final String[] ngram = tokenize(text);
    final int len = text.length();
    for (int i = 0, j = 0, cp; i < len; i += Character.charCount(cp), j++) {
        cp = text.codePointAt(i);
        final QNameTerm key = new QNameTerm(qname, ngram[j]);
        // computeIfAbsent replaces the get/null-check/put dance and avoids
        // duplicating the add() call in both branches.
        ngrams.computeIfAbsent(key, k -> new OccurrenceList()).add(nodeId, i);
    }
}
Also used : OccurrenceList(org.exist.storage.OccurrenceList)

Example 2 with OccurrenceList

Use of org.exist.storage.OccurrenceList in the eXist-db project (exist).

From the class NGramIndexWorker, method dropIndex:

/**
 * Removes this document's entries from the persistent n-gram index for every
 * term currently collected in {@code ngrams}.
 * <p>
 * For each (qname, term) key, the stored index entry is streamed in, sections
 * belonging to other documents are copied through unchanged, and sections for
 * the current document are either dropped entirely
 * ({@link ReindexMode#REMOVE_ALL_NODES}) or filtered so that only occurrences
 * NOT listed in {@code ngrams} survive. The rewritten entry replaces (or, if
 * empty, removes) the stored value. Clears {@code ngrams} when done.
 * <p>
 * NOTE(review): on-disk entry layout per document section appears to be:
 * docId(int), nameType(byte), occurrenceCount(int), byteLength(fixedInt),
 * then node-ids/frequencies/offsets — inferred from the read/write sequence
 * below; confirm against the index format documentation.
 *
 * @param mode whether to remove all of this document's nodes or only the
 *             specific nodes recorded in {@code ngrams}
 */
private void dropIndex(final ReindexMode mode) {
    if (ngrams.isEmpty()) {
        return;
    }
    // Scratch buffer for re-encoding surviving occurrences; reused (cleared)
    // across keys to avoid reallocation.
    final VariableByteOutputStream buf = new VariableByteOutputStream();
    for (final Map.Entry<QNameTerm, OccurrenceList> entry : ngrams.entrySet()) {
        final QNameTerm key = entry.getKey();
        // The list of occurrences to REMOVE for this key (sorted for contains()).
        final OccurrenceList occurencesList = entry.getValue();
        occurencesList.sort();
        os.clear();
        try (final ManagedLock<ReentrantLock> dbLock = lockManager.acquireBtreeWriteLock(index.db.getLockName())) {
            final NGramQNameKey value = new NGramQNameKey(currentDoc.getCollection().getId(), key.qname, index.getBrokerPool().getSymbols(), key.term);
            boolean changed = false;
            os.clear();
            final VariableByteInput is = index.db.getAsStream(value);
            // No stored entry for this term: nothing to remove.
            if (is == null) {
                continue;
            }
            // Stream through every per-document section of the stored entry.
            while (is.available() > 0) {
                final int storedDocId = is.readInt();
                final byte nameType = is.readByte();
                final int occurrences = is.readInt();
                // Read (variable) length of node IDs + frequency + offsets
                final int length = is.readFixedInt();
                if (storedDocId != currentDoc.getDocId()) {
                    // data are related to another document:
                    // copy them to any existing data
                    os.writeInt(storedDocId);
                    os.writeByte(nameType);
                    os.writeInt(occurrences);
                    os.writeFixedInt(length);
                    is.copyRaw(os, length);
                } else {
                    // data are related to our document:
                    if (mode == ReindexMode.REMOVE_ALL_NODES) {
                        // skip them
                        is.skipBytes(length);
                    } else {
                        // removing nodes: need to filter out the node ids to be removed
                        // feed the new list with the GIDs
                        final OccurrenceList newOccurrences = new OccurrenceList();
                        NodeId previous = null;
                        for (int m = 0; m < occurrences; m++) {
                            // Node ids are delta-encoded against the previous id.
                            final NodeId nodeId = index.getBrokerPool().getNodeFactory().createFromStream(previous, is);
                            previous = nodeId;
                            final int freq = is.readInt();
                            // in the list of removed nodes
                            if (!occurencesList.contains(nodeId)) {
                                // Keep this node: copy its offsets into the new list.
                                for (int n = 0; n < freq; n++) {
                                    newOccurrences.add(nodeId, is.readInt());
                                }
                            } else {
                                // Dropped node: skip its `freq` offset values.
                                is.skip(freq);
                            }
                        }
                        // append the data from the new list
                        if (newOccurrences.getSize() > 0) {
                            // Don't forget this one
                            newOccurrences.sort();
                            os.writeInt(currentDoc.getDocId());
                            os.writeByte(nameType);
                            os.writeInt(newOccurrences.getTermCount());
                            // write nodeids, freq, and offsets to a `temp` buf
                            previous = null;
                            // m advances by `freq` each pass: offsets for one node
                            // are stored contiguously at indices m .. m+freq-1.
                            for (int m = 0; m < newOccurrences.getSize(); ) {
                                previous = newOccurrences.getNode(m).write(previous, buf);
                                final int freq = newOccurrences.getOccurrences(m);
                                buf.writeInt(freq);
                                for (int n = 0; n < freq; n++) {
                                    buf.writeInt(newOccurrences.getOffset(m + n));
                                }
                                m += freq;
                            }
                            final byte[] bufData = buf.toByteArray();
                            // clear the buf for the next iteration
                            buf.clear();
                            // Write length of node IDs + frequency + offsets (bytes)
                            os.writeFixedInt(bufData.length);
                            // Write the node IDs + frequency + offset
                            os.write(bufData);
                        }
                    }
                    // A section for our document was seen, so the entry must
                    // be rewritten even if nothing survived.
                    changed = true;
                }
            }
            // Store new data, if relevant
            if (changed) {
                // Well, nothing to store : remove the existing data
                if (os.data().size() == 0) {
                    index.db.remove(value);
                } else {
                    if (index.db.put(value, os.data()) == BFile.UNKNOWN_ADDRESS) {
                        LOG.error("Could not put index data for token '{}' in '{}'", key.term, FileUtils.fileName(index.db.getFile()));
                    }
                }
            }
        } catch (final LockException e) {
            LOG.warn("Failed to acquire lock for file {}", FileUtils.fileName(index.db.getFile()), e);
        } catch (final IOException e) {
            LOG.warn("IO error for file {}", FileUtils.fileName(index.db.getFile()), e);
        } finally {
            os.clear();
        }
    }
    // All pending removals processed; reset the in-memory state.
    ngrams.clear();
}
Also used : ReentrantLock(java.util.concurrent.locks.ReentrantLock) OccurrenceList(org.exist.storage.OccurrenceList) IOException(java.io.IOException) VariableByteInput(org.exist.storage.io.VariableByteInput) VariableByteOutputStream(org.exist.storage.io.VariableByteOutputStream) NodeId(org.exist.numbering.NodeId)

Example 3 with OccurrenceList

Use of org.exist.storage.OccurrenceList in the eXist-db project (exist).

From the class NGramIndexWorker, method saveIndex:

/**
 * Flushes all in-memory n-gram occurrences ({@code ngrams}) for the current
 * document to the persistent index, appending one entry per (qname, term)
 * key, then clears {@code ngrams}.
 * <p>
 * Each entry is written as: docId(int), nameType(byte), termCount(int),
 * byteLength(fixedInt), followed by delta-encoded node ids with their
 * frequencies and offsets — the same layout read back by dropIndex.
 * <p>
 * NOTE(review): the header (docId/nameType/termCount) is written to {@code os}
 * before the try block; if the body writes throw IOException, the entry is
 * still appended with whatever made it into {@code os} — presumably the
 * stream writes cannot fail mid-buffer here, but confirm.
 */
private void saveIndex() {
    if (ngrams.isEmpty()) {
        return;
    }
    // Scratch buffer for the variable-length body; reused across keys.
    final VariableByteOutputStream buf = new VariableByteOutputStream();
    for (final Map.Entry<QNameTerm, OccurrenceList> entry : ngrams.entrySet()) {
        final QNameTerm key = entry.getKey();
        final OccurrenceList occurences = entry.getValue();
        // Sort so node ids can be delta-encoded and offsets grouped per node.
        occurences.sort();
        os.clear();
        os.writeInt(currentDoc.getDocId());
        os.writeByte(key.qname.getNameType());
        os.writeInt(occurences.getTermCount());
        // write nodeids, freq, and offsets to a `temp` buf
        try {
            NodeId previous = null;
            // m advances by `freq` each pass: offsets for one node are stored
            // contiguously at indices m .. m+freq-1.
            for (int m = 0; m < occurences.getSize(); ) {
                previous = occurences.getNode(m).write(previous, buf);
                final int freq = occurences.getOccurrences(m);
                buf.writeInt(freq);
                for (int n = 0; n < freq; n++) {
                    buf.writeInt(occurences.getOffset(m + n));
                }
                m += freq;
            }
            final byte[] bufData = buf.toByteArray();
            // clear the buf for the next iteration
            buf.clear();
            // Write length of node IDs + frequency + offsets (bytes)
            os.writeFixedInt(bufData.length);
            // Write the node IDs + frequency + offset
            os.write(bufData);
        } catch (final IOException e) {
            LOG.error("IOException while writing nGram index: {}", e.getMessage(), e);
        }
        final ByteArray data = os.data();
        // Nothing serialized for this key: skip the db append.
        if (data.size() == 0) {
            continue;
        }
        try (final ManagedLock<ReentrantLock> dbLock = lockManager.acquireBtreeWriteLock(index.db.getLockName())) {
            final NGramQNameKey value = new NGramQNameKey(currentDoc.getCollection().getId(), key.qname, index.getBrokerPool().getSymbols(), key.term);
            // Append (rather than replace) so entries for other documents
            // under the same key are preserved.
            index.db.append(value, data);
        } catch (final LockException e) {
            LOG.warn("Failed to acquire lock for file {}", FileUtils.fileName(index.db.getFile()), e);
        } catch (final IOException e) {
            LOG.warn("IO error for file {}", FileUtils.fileName(index.db.getFile()), e);
        } catch (final ReadOnlyException e) {
            LOG.warn("Read-only error for file {}", FileUtils.fileName(index.db.getFile()), e);
        } finally {
            os.clear();
        }
    }
    // All pending occurrences flushed; reset the in-memory state.
    ngrams.clear();
}
Also used : ReentrantLock(java.util.concurrent.locks.ReentrantLock) OccurrenceList(org.exist.storage.OccurrenceList) IOException(java.io.IOException) VariableByteOutputStream(org.exist.storage.io.VariableByteOutputStream) NodeId(org.exist.numbering.NodeId)

Aggregations

OccurrenceList (org.exist.storage.OccurrenceList)3 IOException (java.io.IOException)2 ReentrantLock (java.util.concurrent.locks.ReentrantLock)2 NodeId (org.exist.numbering.NodeId)2 VariableByteOutputStream (org.exist.storage.io.VariableByteOutputStream)2 VariableByteInput (org.exist.storage.io.VariableByteInput)1