Use of org.apache.lucene.store.Directory in project elasticsearch by elastic.
In the class IndicesQueryCacheTests, the method testBasics:
public void testBasics() throws IOException {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
    w.addDocument(new Document());
    DirectoryReader r = DirectoryReader.open(w);
    w.close();
    ShardId shard = new ShardId("index", "_na_", 0);
    r = ElasticsearchDirectoryReader.wrap(r, shard);
    IndexSearcher s = new IndexSearcher(r);
    s.setQueryCachingPolicy(QueryCachingPolicy.ALWAYS_CACHE);
    Settings settings = Settings.builder()
            .put(IndicesQueryCache.INDICES_CACHE_QUERY_COUNT_SETTING.getKey(), 10)
            .put(IndicesQueryCache.INDICES_QUERIES_CACHE_ALL_SEGMENTS_SETTING.getKey(), true)
            .build();
    IndicesQueryCache cache = new IndicesQueryCache(settings);
    s.setQueryCache(cache);
    QueryCacheStats stats = cache.getStats(shard);
    assertEquals(0L, stats.getCacheSize());
    assertEquals(0L, stats.getCacheCount());
    assertEquals(0L, stats.getHitCount());
    assertEquals(0L, stats.getMissCount());
    // first query: one miss, one new cache entry
    assertEquals(1, s.count(new DummyQuery(0)));
    stats = cache.getStats(shard);
    assertEquals(1L, stats.getCacheSize());
    assertEquals(1L, stats.getCacheCount());
    assertEquals(0L, stats.getHitCount());
    assertEquals(1L, stats.getMissCount());
    // 19 more distinct queries: the cache holds at most 10 entries, so older ones get evicted
    for (int i = 1; i < 20; ++i) {
        assertEquals(1, s.count(new DummyQuery(i)));
    }
    stats = cache.getStats(shard);
    assertEquals(10L, stats.getCacheSize());
    assertEquals(20L, stats.getCacheCount());
    assertEquals(0L, stats.getHitCount());
    assertEquals(20L, stats.getMissCount());
    // this query is still cached, so it counts as a hit
    s.count(new DummyQuery(10));
    stats = cache.getStats(shard);
    assertEquals(10L, stats.getCacheSize());
    assertEquals(20L, stats.getCacheCount());
    assertEquals(1L, stats.getHitCount());
    assertEquals(20L, stats.getMissCount());
    IOUtils.close(r, dir);
    // the cache got emptied, but no changes to the other metrics
    stats = cache.getStats(shard);
    assertEquals(0L, stats.getCacheSize());
    assertEquals(20L, stats.getCacheCount());
    assertEquals(1L, stats.getHitCount());
    assertEquals(20L, stats.getMissCount());
    cache.onClose(shard);
    // forgot everything
    stats = cache.getStats(shard);
    assertEquals(0L, stats.getCacheSize());
    assertEquals(0L, stats.getCacheCount());
    assertEquals(0L, stats.getHitCount());
    assertEquals(0L, stats.getMissCount());
    // this triggers some assertions
    cache.close();
}
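IndicesQueryCache implements Lucene's QueryCache interface (which is why it can be passed to IndexSearcher.setQueryCache above) and adds per-shard statistics on top of it. For comparison, the sketch below wires plain Lucene's LRUQueryCache onto a searcher with roughly the limits the test configures through settings; it is illustrative only, and the class name, method name and RAM budget are assumptions, not part of the Elasticsearch test. Note that the two-argument LRUQueryCache constructor keeps Lucene's default policy of skipping very small segments, whereas the test explicitly enables caching on all segments.

    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.LRUQueryCache;
    import org.apache.lucene.search.QueryCachingPolicy;

    class PlainLuceneQueryCacheSketch {
        // Roughly mirrors the test's settings: at most 10 cached queries, cache every query.
        static IndexSearcher cachingSearcher(IndexReader reader) {
            IndexSearcher searcher = new IndexSearcher(reader);
            searcher.setQueryCache(new LRUQueryCache(10, 1 << 20)); // maxSize (queries), maxRamBytesUsed
            searcher.setQueryCachingPolicy(QueryCachingPolicy.ALWAYS_CACHE);
            return searcher;
        }
    }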
Use of org.apache.lucene.store.Directory in project elasticsearch by elastic.
In the class XAnalyzingSuggester, the method build:
@Override
public void build(InputIterator iterator) throws IOException {
    String prefix = getClass().getSimpleName();
    Directory tempDir = getTempDir();
    OfflineSorter sorter = new OfflineSorter(tempDir, prefix, new AnalyzingComparator(hasPayloads));
    IndexOutput tempInput = tempDir.createTempOutput(prefix, "input", IOContext.DEFAULT);
    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    hasPayloads = iterator.hasPayloads();
    BytesRefBuilder scratch = new BytesRefBuilder();
    TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
    String tempSortedFileName = null;
    count = 0;
    byte[] buffer = new byte[8];
    try {
        ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
        for (BytesRef surfaceForm; (surfaceForm = iterator.next()) != null; ) {
            LimitedFiniteStringsIterator finiteStrings = new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
            for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
                Util.toBytesRef(string, scratch);
                // length of the analyzed text (FST input)
                if (scratch.length() > Short.MAX_VALUE - 2) {
                    throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE - 2) + " in length (got " + scratch.length() + ")");
                }
                short analyzedLength = (short) scratch.length();
                // compute the required length:
                // analyzed sequence + weight (4) + surface + analyzedLength (short)
                int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;
                BytesRef payload;
                if (hasPayloads) {
                    if (surfaceForm.length > (Short.MAX_VALUE - 2)) {
                        throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE - 2) + " in length (got " + surfaceForm.length + ")");
                    }
                    payload = iterator.payload();
                    // payload + surfaceLength (short)
                    requiredLength += payload.length + 2;
                } else {
                    payload = null;
                }
                buffer = ArrayUtil.grow(buffer, requiredLength);
                output.reset(buffer);
                output.writeShort(analyzedLength);
                output.writeBytes(scratch.bytes(), 0, scratch.length());
                output.writeInt(encodeWeight(iterator.weight()));
                if (hasPayloads) {
                    for (int i = 0; i < surfaceForm.length; i++) {
                        if (surfaceForm.bytes[i] == payloadSep) {
                            throw new IllegalArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
                        }
                    }
                    output.writeShort((short) surfaceForm.length);
                    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
                    output.writeBytes(payload.bytes, payload.offset, payload.length);
                } else {
                    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
                }
                assert output.getPosition() == requiredLength : output.getPosition() + " vs " + requiredLength;
                writer.write(buffer, 0, output.getPosition());
            }
            maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, finiteStrings.size());
        }
        writer.close();
        // Sort all input/output pairs (required by FST.Builder):
        tempSortedFileName = sorter.sort(tempInput.getName());
        // Free disk space:
        tempDir.deleteFile(tempInput.getName());
        reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), prefix);
        PairOutputs<Long, BytesRef> outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
        Builder<Pair<Long, BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
        // Build FST:
        BytesRefBuilder previousAnalyzed = null;
        BytesRefBuilder analyzed = new BytesRefBuilder();
        BytesRef surface = new BytesRef();
        IntsRefBuilder scratchInts = new IntsRefBuilder();
        ByteArrayDataInput input = new ByteArrayDataInput();
        // Used to remove duplicate surface forms (but we
        // still index the highest-weight one). We clear
        // this when we see a new analyzed form, so it cannot
        // grow unbounded (at most 256 entries):
        Set<BytesRef> seenSurfaceForms = new HashSet<>();
        int dedup = 0;
        while (true) {
            BytesRef bytes = reader.next();
            if (bytes == null) {
                break;
            }
            input.reset(bytes.bytes, bytes.offset, bytes.length);
            short analyzedLength = input.readShort();
            analyzed.grow(analyzedLength + 2);
            input.readBytes(analyzed.bytes(), 0, analyzedLength);
            analyzed.setLength(analyzedLength);
            long cost = input.readInt();
            surface.bytes = bytes.bytes;
            if (hasPayloads) {
                surface.length = input.readShort();
                surface.offset = input.getPosition();
            } else {
                surface.offset = input.getPosition();
                surface.length = bytes.length - surface.offset;
            }
            if (previousAnalyzed == null) {
                previousAnalyzed = new BytesRefBuilder();
                previousAnalyzed.copyBytes(analyzed);
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            } else if (analyzed.get().equals(previousAnalyzed.get())) {
                dedup++;
                if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
                    // dups: skip the rest:
                    continue;
                }
                if (seenSurfaceForms.contains(surface)) {
                    continue;
                }
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            } else {
                dedup = 0;
                previousAnalyzed.copyBytes(analyzed);
                seenSurfaceForms.clear();
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            }
            // TODO: I think we can avoid the extra 2 bytes when
            // there is no dup (dedup==0), but we'd have to fix
            // the exactFirst logic ... which would be sort of
            // hairy because we'd need to special case the two
            // (dup/not dup)...
            // NOTE: must be byte 0 so we sort before whatever
            // is next
            analyzed.append((byte) 0);
            analyzed.append((byte) dedup);
            Util.toIntsRef(analyzed.get(), scratchInts);
            //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
            if (!hasPayloads) {
                builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
            } else {
                int payloadOffset = input.getPosition() + surface.length;
                int payloadLength = bytes.length - payloadOffset;
                BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
                System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
                br.bytes[surface.length] = (byte) payloadSep;
                System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length + 1, payloadLength);
                br.length = br.bytes.length;
                builder.add(scratchInts.get(), outputs.newPair(cost, br));
            }
        }
        fst = builder.finish();
        //PrintWriter pw = new PrintWriter("/tmp/out.dot");
        //Util.toDot(fst, pw, true, true);
        //pw.close();
    } finally {
        IOUtils.closeWhileHandlingException(reader, writer);
        IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
    }
}
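Every record handed to the OfflineSorter above uses a fixed field order, and the FST-building loop reads that same order back after sorting. The helper below is only an illustrative sketch derived from the writer logic in build(); SuggesterRecordSketch and decodeRecord are hypothetical names, not part of XAnalyzingSuggester.

    import org.apache.lucene.store.ByteArrayDataInput;
    import org.apache.lucene.util.BytesRef;

    class SuggesterRecordSketch {
        // Field order written by build():
        //   analyzedLength (short) | analyzed bytes | encoded weight (int)
        //   | surfaceLength (short) | surface bytes | payload bytes   when hasPayloads
        //   | surface bytes                                           otherwise
        static void decodeRecord(BytesRef record, boolean hasPayloads) {
            ByteArrayDataInput in = new ByteArrayDataInput(record.bytes, record.offset, record.length);
            short analyzedLength = in.readShort();
            byte[] analyzed = new byte[analyzedLength];
            in.readBytes(analyzed, 0, analyzedLength);
            int encodedWeight = in.readInt();       // produced by encodeWeight(iterator.weight())
            int consumed = 2 + analyzedLength + 4;  // bytes read so far
            if (hasPayloads) {
                short surfaceLength = in.readShort();
                byte[] surface = new byte[surfaceLength];
                in.readBytes(surface, 0, surfaceLength);
                byte[] payload = new byte[record.length - consumed - 2 - surfaceLength];
                in.readBytes(payload, 0, payload.length);
            } else {
                // without payloads, the surface form simply occupies the rest of the record
                byte[] surface = new byte[record.length - consumed];
                in.readBytes(surface, 0, surface.length);
            }
        }
    }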
Use of org.apache.lucene.store.Directory in project elasticsearch by elastic.
In the class CorruptionUtils, the method corruptFile:
/**
 * Corrupts a random file at a random position
 */
public static void corruptFile(Random random, Path... files) throws IOException {
    assertTrue("files must be non-empty", files.length > 0);
    final Path fileToCorrupt = RandomPicks.randomFrom(random, files);
    assertTrue(fileToCorrupt + " is not a file", Files.isRegularFile(fileToCorrupt));
    try (Directory dir = FSDirectory.open(fileToCorrupt.toAbsolutePath().getParent())) {
        long checksumBeforeCorruption;
        try (IndexInput input = dir.openInput(fileToCorrupt.getFileName().toString(), IOContext.DEFAULT)) {
            checksumBeforeCorruption = CodecUtil.retrieveChecksum(input);
        }
        try (FileChannel raf = FileChannel.open(fileToCorrupt, StandardOpenOption.READ, StandardOpenOption.WRITE)) {
            // read one byte at a random position
            raf.position(random.nextInt((int) Math.min(Integer.MAX_VALUE, raf.size())));
            long filePointer = raf.position();
            ByteBuffer bb = ByteBuffer.wrap(new byte[1]);
            raf.read(bb);
            bb.flip();
            // corrupt it
            byte oldValue = bb.get(0);
            byte newValue = (byte) (oldValue + 1);
            bb.put(0, newValue);
            // rewrite it in place
            raf.position(filePointer);
            raf.write(bb);
            logger.info("Corrupting file -- flipping at position {} from {} to {} file: {}", filePointer, Integer.toHexString(oldValue), Integer.toHexString(newValue), fileToCorrupt.getFileName());
        }
        long checksumAfterCorruption;
        long actualChecksumAfterCorruption;
        try (ChecksumIndexInput input = dir.openChecksumInput(fileToCorrupt.getFileName().toString(), IOContext.DEFAULT)) {
            assertThat(input.getFilePointer(), is(0L));
            // the last 8 bytes of the file hold the stored checksum (one long)
            input.seek(input.length() - 8);
            checksumAfterCorruption = input.getChecksum();
            actualChecksumAfterCorruption = input.readLong();
        }
        // We can only assume (not assert) that the checksums no longer match: there is a small
        // chance of a checksum collision after the corruption, which is acceptable here.
        StringBuilder msg = new StringBuilder();
        msg.append("before: [").append(checksumBeforeCorruption).append("] ");
        msg.append("after: [").append(checksumAfterCorruption).append("] ");
        msg.append("checksum value after corruption: [").append(actualChecksumAfterCorruption).append("] ");
        msg.append("file: ").append(fileToCorrupt.getFileName()).append(" length: ").append(dir.fileLength(fileToCorrupt.getFileName().toString()));
        logger.info("Checksum {}", msg);
        assumeTrue("Checksum collision - " + msg.toString(),
                checksumAfterCorruption != checksumBeforeCorruption // collision
                || actualChecksumAfterCorruption != checksumBeforeCorruption); // checksum corrupted
        assertThat("no file corrupted", fileToCorrupt, notNullValue());
    }
}
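corruptFile only flips a byte and logs the checksums; callers typically verify afterwards that Lucene actually rejects the file. A minimal sketch of such a check using Lucene's CodecUtil.checksumEntireFile is shown below; the helper name isDetectedAsCorrupted is an assumption for illustration, not part of CorruptionUtils.

    import java.io.IOException;
    import java.nio.file.Path;
    import org.apache.lucene.codecs.CodecUtil;
    import org.apache.lucene.index.CorruptIndexException;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.store.IOContext;
    import org.apache.lucene.store.IndexInput;

    class CorruptionCheckSketch {
        static boolean isDetectedAsCorrupted(Path file) throws IOException {
            try (Directory dir = FSDirectory.open(file.getParent());
                 IndexInput in = dir.openInput(file.getFileName().toString(), IOContext.READONCE)) {
                CodecUtil.checksumEntireFile(in); // recomputes the checksum and compares it to the footer
                return false;                     // checksums still match (e.g. a rare collision)
            } catch (CorruptIndexException e) {
                return true;                      // the flipped byte was detected
            }
        }
    }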
Use of org.apache.lucene.store.Directory in project elasticsearch by elastic.
In the class FreqTermsEnumTests, the method setUp:
@Before
@Override
public void setUp() throws Exception {
    super.setUp();
    referenceAll = new HashMap<>();
    referenceNotDeleted = new HashMap<>();
    referenceFilter = new HashMap<>();
    Directory dir = newDirectory();
    // use a keyword analyzer; we rely on the stored field holding the exact term.
    IndexWriterConfig conf = newIndexWriterConfig(new KeywordAnalyzer());
    if (frequently()) {
        // we don't want to do any merges, so we won't expunge deletes
        conf.setMergePolicy(NoMergePolicy.INSTANCE);
    }
    iw = new IndexWriter(dir, conf);
    terms = new String[scaledRandomIntBetween(10, 300)];
    for (int i = 0; i < terms.length; i++) {
        terms[i] = randomAsciiOfLength(5);
    }
    int numberOfDocs = scaledRandomIntBetween(30, 300);
    Document[] docs = new Document[numberOfDocs];
    for (int i = 0; i < numberOfDocs; i++) {
        Document doc = new Document();
        doc.add(new StringField("id", Integer.toString(i), Field.Store.YES));
        docs[i] = doc;
        for (String term : terms) {
            if (randomBoolean()) {
                continue;
            }
            int freq = randomIntBetween(1, 3);
            for (int j = 0; j < freq; j++) {
                doc.add(new TextField("field", term, Field.Store.YES));
            }
        }
    }
    for (int i = 0; i < docs.length; i++) {
        Document doc = docs[i];
        iw.addDocument(doc);
        if (rarely()) {
            iw.commit();
        }
    }
    Set<String> deletedIds = new HashSet<>();
    for (int i = 0; i < docs.length; i++) {
        Document doc = docs[i];
        if (randomInt(5) == 2) {
            Term idTerm = new Term("id", doc.getField("id").stringValue());
            deletedIds.add(idTerm.text());
            iw.deleteDocuments(idTerm);
        }
    }
    for (String term : terms) {
        referenceAll.put(term, new FreqHolder());
        referenceFilter.put(term, new FreqHolder());
        referenceNotDeleted.put(term, new FreqHolder());
    }
    // now go over each doc, build the relevant references and filter
    reader = DirectoryReader.open(iw);
    List<BytesRef> filterTerms = new ArrayList<>();
    for (int docId = 0; docId < reader.maxDoc(); docId++) {
        Document doc = reader.document(docId);
        addFreqs(doc, referenceAll);
        if (!deletedIds.contains(doc.getField("id").stringValue())) {
            addFreqs(doc, referenceNotDeleted);
            if (randomBoolean()) {
                filterTerms.add(new BytesRef(doc.getField("id").stringValue()));
                addFreqs(doc, referenceFilter);
            }
        }
    }
    filter = new TermInSetQuery("id", filterTerms);
}
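The setUp method relies on the FreqHolder class and the addFreqs helper, which are defined elsewhere in FreqTermsEnumTests and are not shown in this snippet. A hypothetical sketch of what that reference bookkeeping could look like is given below; the field names and exact logic are assumptions, not the actual Elasticsearch implementation.

    import java.util.HashSet;
    import java.util.Map;
    import java.util.Set;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.IndexableField;

    class ReferenceFreqsSketch {
        static class FreqHolder {
            int docFreq;         // number of documents containing the term
            long totalTermFreq;  // total number of occurrences across documents
        }

        // Counts the stored "field" values of one document into the reference map.
        static void addFreqs(Document doc, Map<String, FreqHolder> reference) {
            Set<String> countedForDocFreq = new HashSet<>();
            for (IndexableField field : doc.getFields("field")) {
                String term = field.stringValue();
                FreqHolder holder = reference.get(term);
                if (holder == null) {
                    continue; // term not tracked in this reference map
                }
                if (countedForDocFreq.add(term)) {
                    holder.docFreq++;         // count each term once per document
                }
                holder.totalTermFreq++;       // count every occurrence
            }
        }
    }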
Use of org.apache.lucene.store.Directory in project elasticsearch by elastic.
In the class MoreLikeThisQueryTests, the method testSimple:
public void testSimple() throws Exception {
    Directory dir = new RAMDirectory();
    IndexWriter indexWriter = new IndexWriter(dir, new IndexWriterConfig(Lucene.STANDARD_ANALYZER));
    indexWriter.commit();
    Document document = new Document();
    document.add(new TextField("_id", "1", Field.Store.YES));
    document.add(new TextField("text", "lucene", Field.Store.YES));
    indexWriter.addDocument(document);
    document = new Document();
    document.add(new TextField("_id", "2", Field.Store.YES));
    document.add(new TextField("text", "lucene release", Field.Store.YES));
    indexWriter.addDocument(document);
    IndexReader reader = DirectoryReader.open(indexWriter);
    IndexSearcher searcher = new IndexSearcher(reader);
    MoreLikeThisQuery mltQuery = new MoreLikeThisQuery("lucene", new String[] { "text" }, Lucene.STANDARD_ANALYZER);
    mltQuery.setLikeText("lucene");
    mltQuery.setMinTermFrequency(1);
    mltQuery.setMinDocFreq(1);
    long count = searcher.count(mltQuery);
    assertThat(count, equalTo(2L));
    reader.close();
    indexWriter.close();
}
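This test builds its index in a RAMDirectory. RAMDirectory is deprecated in recent Lucene releases and removed in Lucene 9, so if the snippet is ported to a newer Lucene, ByteBuffersDirectory is the usual in-memory replacement. Below is a minimal sketch, assuming a plain StandardAnalyzer in place of the Lucene.STANDARD_ANALYZER constant; the class and method names are illustrative.

    import java.io.IOException;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.ByteBuffersDirectory;
    import org.apache.lucene.store.Directory;

    class InMemoryDirectorySketch {
        // Same in-memory setup as the test above, on Lucene versions where RAMDirectory is gone.
        static IndexWriter newInMemoryWriter() throws IOException {
            Directory dir = new ByteBuffersDirectory();
            return new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));
        }
    }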