Use of org.exist.storage.OccurrenceList in project exist by eXist-db.
The class NGramIndexWorker, method indexText.
private void indexText(final NodeId nodeId, final QName qname, final String text) {
    final String[] ngram = tokenize(text);
    final int len = text.length();
    // Walk the text by code point so that offsets stay correct when the
    // text contains supplementary (non-BMP) characters.
    for (int i = 0, j = 0, cp; i < len; i += Character.charCount(cp), j++) {
        cp = text.codePointAt(i);
        final QNameTerm key = new QNameTerm(qname, ngram[j]);
        OccurrenceList list = ngrams.get(key);
        if (list == null) {
            // first occurrence of this n-gram: create a new list
            list = new OccurrenceList();
            list.add(nodeId, i);
            ngrams.put(key, list);
        } else {
            // record a further occurrence at character offset i
            list.add(nodeId, i);
        }
    }
}
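The pattern above — look up the key, create the OccurrenceList on a miss, then record the (node id, character offset) pair — is the classic per-key accumulation idiom. A standalone sketch of the same idiom using plain JDK collections in place of eXist's OccurrenceList (the NGramAccumulator class and its inline tokenizer are hypothetical, for illustration only):

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class NGramAccumulator {
    // one list of character offsets per n-gram, standing in for
    // Map<QNameTerm, OccurrenceList>
    final Map<String, List<Integer>> occurrences = new HashMap<>();

    void indexText(final String text, final int n) {
        final int len = text.length();
        // advance by code point, exactly as indexText does above
        for (int i = 0, cp; i < len; i += Character.charCount(cp)) {
            cp = text.codePointAt(i);
            // take up to n code points starting at i as the n-gram
            final int remaining = text.codePointCount(i, len);
            final int end = text.offsetByCodePoints(i, Math.min(n, remaining));
            // computeIfAbsent replaces the get/null-check/put dance
            occurrences.computeIfAbsent(text.substring(i, end), k -> new ArrayList<>()).add(i);
        }
    }

    public static void main(String[] args) {
        final NGramAccumulator acc = new NGramAccumulator();
        acc.indexText("banana", 3);
        System.out.println(acc.occurrences.get("ana")); // [1, 3]
    }
}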
Use of org.exist.storage.OccurrenceList in project exist by eXist-db.
The class NGramIndexWorker, method dropIndex.
private void dropIndex(final ReindexMode mode) {
    if (ngrams.isEmpty()) {
        return;
    }
    final VariableByteOutputStream buf = new VariableByteOutputStream();
    for (final Map.Entry<QNameTerm, OccurrenceList> entry : ngrams.entrySet()) {
        final QNameTerm key = entry.getKey();
        final OccurrenceList occurencesList = entry.getValue();
        occurencesList.sort();
        os.clear();
        try (final ManagedLock<ReentrantLock> dbLock = lockManager.acquireBtreeWriteLock(index.db.getLockName())) {
            final NGramQNameKey value = new NGramQNameKey(currentDoc.getCollection().getId(), key.qname, index.getBrokerPool().getSymbols(), key.term);
            boolean changed = false;
            os.clear();
            final VariableByteInput is = index.db.getAsStream(value);
            if (is == null) {
                continue;
            }
            while (is.available() > 0) {
                final int storedDocId = is.readInt();
                final byte nameType = is.readByte();
                final int occurrences = is.readInt();
                // Read (variable) length of node IDs + frequency + offsets
                final int length = is.readFixedInt();
                if (storedDocId != currentDoc.getDocId()) {
                    // data are related to another document:
                    // copy them through to the output unchanged
                    os.writeInt(storedDocId);
                    os.writeByte(nameType);
                    os.writeInt(occurrences);
                    os.writeFixedInt(length);
                    is.copyRaw(os, length);
                } else {
                    // data are related to our document:
                    if (mode == ReindexMode.REMOVE_ALL_NODES) {
                        // skip them entirely
                        is.skipBytes(length);
                    } else {
                        // removing selected nodes: filter out the node ids to be
                        // removed and feed the new list with the remaining GIDs
                        final OccurrenceList newOccurrences = new OccurrenceList();
                        NodeId previous = null;
                        for (int m = 0; m < occurrences; m++) {
                            final NodeId nodeId = index.getBrokerPool().getNodeFactory().createFromStream(previous, is);
                            previous = nodeId;
                            final int freq = is.readInt();
                            // keep the occurrence only if the node is not
                            // in the list of removed nodes
                            if (!occurencesList.contains(nodeId)) {
                                for (int n = 0; n < freq; n++) {
                                    newOccurrences.add(nodeId, is.readInt());
                                }
                            } else {
                                is.skip(freq);
                            }
                        }
                        // append the data from the new list
                        if (newOccurrences.getSize() > 0) {
                            // Don't forget this one
                            newOccurrences.sort();
                            os.writeInt(currentDoc.getDocId());
                            os.writeByte(nameType);
                            os.writeInt(newOccurrences.getTermCount());
                            // write node ids, freq, and offsets to a temporary buffer
                            previous = null;
                            for (int m = 0; m < newOccurrences.getSize(); ) {
                                previous = newOccurrences.getNode(m).write(previous, buf);
                                final int freq = newOccurrences.getOccurrences(m);
                                buf.writeInt(freq);
                                for (int n = 0; n < freq; n++) {
                                    buf.writeInt(newOccurrences.getOffset(m + n));
                                }
                                m += freq;
                            }
                            final byte[] bufData = buf.toByteArray();
                            // clear the buffer for the next iteration
                            buf.clear();
                            // Write length of node IDs + frequency + offsets (bytes)
                            os.writeFixedInt(bufData.length);
                            // Write the node IDs + frequency + offsets
                            os.write(bufData);
                        }
                    }
                    changed = true;
                }
            }
            // Store the new data, if relevant
            if (changed) {
                if (os.data().size() == 0) {
                    // Well, nothing to store: remove the existing entry
                    index.db.remove(value);
                } else {
                    if (index.db.put(value, os.data()) == BFile.UNKNOWN_ADDRESS) {
                        LOG.error("Could not put index data for token '{}' in '{}'", key.term, FileUtils.fileName(index.db.getFile()));
                    }
                }
            }
        } catch (final LockException e) {
            LOG.warn("Failed to acquire lock for file {}", FileUtils.fileName(index.db.getFile()), e);
        } catch (final IOException e) {
            LOG.warn("IO error for file {}", FileUtils.fileName(index.db.getFile()), e);
        } finally {
            os.clear();
        }
    }
    ngrams.clear();
}
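The loop above is a read-filter-rewrite pass over one BFile entry: records belonging to other documents are copied through verbatim (copyRaw), records for the current document are either skipped wholesale (REMOVE_ALL_NODES) or rebuilt without the removed nodes, and an entry that ends up empty is deleted. A standalone sketch of just the filtering step, using plain JDK collections in place of OccurrenceList and NodeId (the Occurrence record and filterRemoved name are hypothetical):

import java.util.ArrayList;
import java.util.List;
import java.util.Set;

// Illustrative stand-in for one stored (node id, offset) occurrence.
record Occurrence(long nodeId, int offset) {}

public class OccurrenceFilter {

    // Rebuild the stored occurrences, dropping every occurrence whose node
    // is in the removed set -- the same shape as the newOccurrences loop
    // in dropIndex above.
    static List<Occurrence> filterRemoved(final List<Occurrence> stored,
                                          final Set<Long> removedNodes) {
        final List<Occurrence> kept = new ArrayList<>();
        for (final Occurrence occ : stored) {
            if (!removedNodes.contains(occ.nodeId())) {
                kept.add(occ);
            }
        }
        return kept;
    }

    public static void main(String[] args) {
        final List<Occurrence> stored = List.of(
                new Occurrence(1L, 0), new Occurrence(1L, 7),
                new Occurrence(2L, 3));
        // Removing node 1 keeps only node 2's occurrence; an empty result
        // would correspond to the index.db.remove(value) branch above.
        System.out.println(filterRemoved(stored, Set.of(1L)));
    }
}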
Use of org.exist.storage.OccurrenceList in project exist by eXist-db.
The class NGramIndexWorker, method saveIndex.
private void saveIndex() {
    if (ngrams.isEmpty()) {
        return;
    }
    final VariableByteOutputStream buf = new VariableByteOutputStream();
    for (final Map.Entry<QNameTerm, OccurrenceList> entry : ngrams.entrySet()) {
        final QNameTerm key = entry.getKey();
        final OccurrenceList occurences = entry.getValue();
        occurences.sort();
        os.clear();
        os.writeInt(currentDoc.getDocId());
        os.writeByte(key.qname.getNameType());
        os.writeInt(occurences.getTermCount());
        // write node ids, freq, and offsets to a temporary buffer
        try {
            NodeId previous = null;
            for (int m = 0; m < occurences.getSize(); ) {
                previous = occurences.getNode(m).write(previous, buf);
                final int freq = occurences.getOccurrences(m);
                buf.writeInt(freq);
                for (int n = 0; n < freq; n++) {
                    buf.writeInt(occurences.getOffset(m + n));
                }
                m += freq;
            }
            final byte[] bufData = buf.toByteArray();
            // clear the buffer for the next iteration
            buf.clear();
            // Write length of node IDs + frequency + offsets (bytes)
            os.writeFixedInt(bufData.length);
            // Write the node IDs + frequency + offsets
            os.write(bufData);
        } catch (final IOException e) {
            LOG.error("IOException while writing nGram index: {}", e.getMessage(), e);
        }
        final ByteArray data = os.data();
        if (data.size() == 0) {
            continue;
        }
        try (final ManagedLock<ReentrantLock> dbLock = lockManager.acquireBtreeWriteLock(index.db.getLockName())) {
            final NGramQNameKey value = new NGramQNameKey(currentDoc.getCollection().getId(), key.qname, index.getBrokerPool().getSymbols(), key.term);
            index.db.append(value, data);
        } catch (final LockException e) {
            LOG.warn("Failed to acquire lock for file {}", FileUtils.fileName(index.db.getFile()), e);
        } catch (final IOException e) {
            LOG.warn("IO error for file {}", FileUtils.fileName(index.db.getFile()), e);
        } catch (final ReadOnlyException e) {
            LOG.warn("Read-only error for file {}", FileUtils.fileName(index.db.getFile()), e);
        } finally {
            os.clear();
        }
    }
    ngrams.clear();
}
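Both saveIndex and the rewrite branch of dropIndex emit the same per-term record: document id (int), name type (byte), distinct node count (int), a fixed-width byte length, then a body of delta-coded node ids each followed by a frequency and that many offsets. A rough sketch of that layout using DataOutputStream (field widths here are fixed for simplicity; eXist actually delta-codes node ids and writes through VariableByteOutputStream, and the class and method names are hypothetical):

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.List;
import java.util.Map;

public class RecordLayoutSketch {

    static byte[] writeEntry(final int docId, final byte nameType,
                             final Map<Long, List<Integer>> offsetsByNode) throws IOException {
        // Body: one (node id, frequency, offsets...) run per node,
        // mirroring the `buf` loop in saveIndex above.
        final ByteArrayOutputStream body = new ByteArrayOutputStream();
        final DataOutputStream bodyOut = new DataOutputStream(body);
        for (final Map.Entry<Long, List<Integer>> e : offsetsByNode.entrySet()) {
            bodyOut.writeLong(e.getKey());           // node id (eXist delta-codes this)
            bodyOut.writeInt(e.getValue().size());   // frequency
            for (final int offset : e.getValue()) {
                bodyOut.writeInt(offset);            // character offset
            }
        }
        final byte[] bodyData = body.toByteArray();

        // Header + length-prefixed body, mirroring `os` in saveIndex.
        final ByteArrayOutputStream out = new ByteArrayOutputStream();
        final DataOutputStream header = new DataOutputStream(out);
        header.writeInt(docId);
        header.writeByte(nameType);
        header.writeInt(offsetsByNode.size()); // what dropIndex reads back as `occurrences`
        header.writeInt(bodyData.length);      // fixed-width length, like writeFixedInt
        header.write(bodyData);
        return out.toByteArray();
    }

    public static void main(String[] args) throws IOException {
        final byte[] entry = writeEntry(42, (byte) 1, Map.of(7L, List.of(0, 5)));
        System.out.println(entry.length + " bytes"); // 33 bytes: 13 header + 20 body
    }
}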