
Example 51 with Directory

Use of org.apache.lucene.store.Directory in the elasticsearch project by elastic.

From the class IndicesQueryCacheTests, method testBasics.

public void testBasics() throws IOException {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
    w.addDocument(new Document());
    DirectoryReader r = DirectoryReader.open(w);
    w.close();
    ShardId shard = new ShardId("index", "_na_", 0);
    r = ElasticsearchDirectoryReader.wrap(r, shard);
    IndexSearcher s = new IndexSearcher(r);
    s.setQueryCachingPolicy(QueryCachingPolicy.ALWAYS_CACHE);
    Settings settings = Settings.builder()
        .put(IndicesQueryCache.INDICES_CACHE_QUERY_COUNT_SETTING.getKey(), 10)
        .put(IndicesQueryCache.INDICES_QUERIES_CACHE_ALL_SEGMENTS_SETTING.getKey(), true)
        .build();
    IndicesQueryCache cache = new IndicesQueryCache(settings);
    s.setQueryCache(cache);
    QueryCacheStats stats = cache.getStats(shard);
    assertEquals(0L, stats.getCacheSize());
    assertEquals(0L, stats.getCacheCount());
    assertEquals(0L, stats.getHitCount());
    assertEquals(0L, stats.getMissCount());
    assertEquals(1, s.count(new DummyQuery(0)));
    stats = cache.getStats(shard);
    assertEquals(1L, stats.getCacheSize());
    assertEquals(1L, stats.getCacheCount());
    assertEquals(0L, stats.getHitCount());
    assertEquals(1L, stats.getMissCount());
    for (int i = 1; i < 20; ++i) {
        assertEquals(1, s.count(new DummyQuery(i)));
    }
    stats = cache.getStats(shard);
    assertEquals(10L, stats.getCacheSize());
    assertEquals(20L, stats.getCacheCount());
    assertEquals(0L, stats.getHitCount());
    assertEquals(20L, stats.getMissCount());
    s.count(new DummyQuery(10));
    stats = cache.getStats(shard);
    assertEquals(10L, stats.getCacheSize());
    assertEquals(20L, stats.getCacheCount());
    assertEquals(1L, stats.getHitCount());
    assertEquals(20L, stats.getMissCount());
    IOUtils.close(r, dir);
    // got emptied, but no changes to other metrics
    stats = cache.getStats(shard);
    assertEquals(0L, stats.getCacheSize());
    assertEquals(20L, stats.getCacheCount());
    assertEquals(1L, stats.getHitCount());
    assertEquals(20L, stats.getMissCount());
    cache.onClose(shard);
    // forgot everything
    stats = cache.getStats(shard);
    assertEquals(0L, stats.getCacheSize());
    assertEquals(0L, stats.getCacheCount());
    assertEquals(0L, stats.getHitCount());
    assertEquals(0L, stats.getMissCount());
    // this triggers some assertions
    cache.close();
}
Also used: ShardId (org.elasticsearch.index.shard.ShardId), IndexSearcher (org.apache.lucene.search.IndexSearcher), IndicesQueryCache (org.elasticsearch.indices.IndicesQueryCache), IndexWriter (org.apache.lucene.index.IndexWriter), DirectoryReader (org.apache.lucene.index.DirectoryReader), ElasticsearchDirectoryReader (org.elasticsearch.common.lucene.index.ElasticsearchDirectoryReader), QueryCacheStats (org.elasticsearch.index.cache.query.QueryCacheStats), Document (org.apache.lucene.document.Document), Settings (org.elasticsearch.common.settings.Settings), Directory (org.apache.lucene.store.Directory)
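The test above exercises the Elasticsearch-specific IndicesQueryCache, but the underlying wiring, an IndexSearcher given an explicit query cache and caching policy, is plain Lucene. Below is a minimal sketch of that pattern using Lucene's stock LRUQueryCache as a stand-in; the cache sizes, field names, and class name are illustrative assumptions, not taken from the test.

// Sketch: same searcher/cache wiring with stock Lucene classes (assumption: LRUQueryCache
// stands in for IndicesQueryCache; sizes and field names are arbitrary).
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.LRUQueryCache;
import org.apache.lucene.search.QueryCachingPolicy;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class QueryCacheWiringSketch {
    public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));
        Document doc = new Document();
        doc.add(new StringField("id", "1", Field.Store.YES));
        w.addDocument(doc);
        DirectoryReader r = DirectoryReader.open(w);
        w.close();
        IndexSearcher s = new IndexSearcher(r);
        // Cache at most 10 queries and up to ~1 MB of cached doc-id sets.
        s.setQueryCache(new LRUQueryCache(10, 1024 * 1024));
        // Cache every query, mirroring QueryCachingPolicy.ALWAYS_CACHE in the test above.
        s.setQueryCachingPolicy(QueryCachingPolicy.ALWAYS_CACHE);
        System.out.println(s.count(new TermQuery(new Term("id", "1")))); // 1
        r.close();
        dir.close();
    }
}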

Example 52 with Directory

Use of org.apache.lucene.store.Directory in the elasticsearch project by elastic.

From the class XAnalyzingSuggester, method build.

@Override
public void build(InputIterator iterator) throws IOException {
    String prefix = getClass().getSimpleName();
    Directory tempDir = getTempDir();
    OfflineSorter sorter = new OfflineSorter(tempDir, prefix, new AnalyzingComparator(hasPayloads));
    IndexOutput tempInput = tempDir.createTempOutput(prefix, "input", IOContext.DEFAULT);
    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    hasPayloads = iterator.hasPayloads();
    BytesRefBuilder scratch = new BytesRefBuilder();
    TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
    String tempSortedFileName = null;
    count = 0;
    byte[] buffer = new byte[8];
    try {
        ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
        for (BytesRef surfaceForm; (surfaceForm = iterator.next()) != null; ) {
            LimitedFiniteStringsIterator finiteStrings = new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
            for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
                Util.toBytesRef(string, scratch);
                // length of the analyzed text (FST input)
                if (scratch.length() > Short.MAX_VALUE - 2) {
                    throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE - 2) + " in length (got " + scratch.length() + ")");
                }
                short analyzedLength = (short) scratch.length();
                // compute the required length:
                // analyzed sequence + weight (4) + surface + analyzedLength (short)
                int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;
                BytesRef payload;
                if (hasPayloads) {
                    if (surfaceForm.length > (Short.MAX_VALUE - 2)) {
                        throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE - 2) + " in length (got " + surfaceForm.length + ")");
                    }
                    payload = iterator.payload();
                    // payload + surfaceLength (short)
                    requiredLength += payload.length + 2;
                } else {
                    payload = null;
                }
                buffer = ArrayUtil.grow(buffer, requiredLength);
                output.reset(buffer);
                output.writeShort(analyzedLength);
                output.writeBytes(scratch.bytes(), 0, scratch.length());
                output.writeInt(encodeWeight(iterator.weight()));
                if (hasPayloads) {
                    for (int i = 0; i < surfaceForm.length; i++) {
                        if (surfaceForm.bytes[i] == payloadSep) {
                            throw new IllegalArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
                        }
                    }
                    output.writeShort((short) surfaceForm.length);
                    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
                    output.writeBytes(payload.bytes, payload.offset, payload.length);
                } else {
                    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
                }
                assert output.getPosition() == requiredLength : output.getPosition() + " vs " + requiredLength;
                writer.write(buffer, 0, output.getPosition());
            }
            maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, finiteStrings.size());
        }
        writer.close();
        // Sort all input/output pairs (required by FST.Builder):
        tempSortedFileName = sorter.sort(tempInput.getName());
        // Free disk space:
        tempDir.deleteFile(tempInput.getName());
        reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), prefix);
        PairOutputs<Long, BytesRef> outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
        Builder<Pair<Long, BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
        // Build FST:
        BytesRefBuilder previousAnalyzed = null;
        BytesRefBuilder analyzed = new BytesRefBuilder();
        BytesRef surface = new BytesRef();
        IntsRefBuilder scratchInts = new IntsRefBuilder();
        ByteArrayDataInput input = new ByteArrayDataInput();
        // Used to remove duplicate surface forms (but we
        // still index the highest-weight one). We clear
        // this when we see a new analyzed form, so it cannot
        // grow unbounded (at most 256 entries):
        Set<BytesRef> seenSurfaceForms = new HashSet<>();
        int dedup = 0;
        while (true) {
            BytesRef bytes = reader.next();
            if (bytes == null) {
                break;
            }
            input.reset(bytes.bytes, bytes.offset, bytes.length);
            short analyzedLength = input.readShort();
            analyzed.grow(analyzedLength + 2);
            input.readBytes(analyzed.bytes(), 0, analyzedLength);
            analyzed.setLength(analyzedLength);
            long cost = input.readInt();
            surface.bytes = bytes.bytes;
            if (hasPayloads) {
                surface.length = input.readShort();
                surface.offset = input.getPosition();
            } else {
                surface.offset = input.getPosition();
                surface.length = bytes.length - surface.offset;
            }
            if (previousAnalyzed == null) {
                previousAnalyzed = new BytesRefBuilder();
                previousAnalyzed.copyBytes(analyzed);
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            } else if (analyzed.get().equals(previousAnalyzed.get())) {
                dedup++;
                if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
                    // dups: skip the rest:
                    continue;
                }
                if (seenSurfaceForms.contains(surface)) {
                    continue;
                }
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            } else {
                dedup = 0;
                previousAnalyzed.copyBytes(analyzed);
                seenSurfaceForms.clear();
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            }
            // TODO: I think we can avoid the extra 2 bytes when
            // there is no dup (dedup==0), but we'd have to fix
            // the exactFirst logic ... which would be sort of
            // hairy because we'd need to special case the two
            // (dup/not dup)...
            // NOTE: must be byte 0 so we sort before whatever
            // is next
            analyzed.append((byte) 0);
            analyzed.append((byte) dedup);
            Util.toIntsRef(analyzed.get(), scratchInts);
            //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
            if (!hasPayloads) {
                builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
            } else {
                int payloadOffset = input.getPosition() + surface.length;
                int payloadLength = bytes.length - payloadOffset;
                BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
                System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
                br.bytes[surface.length] = (byte) payloadSep;
                System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length + 1, payloadLength);
                br.length = br.bytes.length;
                builder.add(scratchInts.get(), outputs.newPair(cost, br));
            }
        }
        fst = builder.finish();
    //PrintWriter pw = new PrintWriter("/tmp/out.dot");
    //Util.toDot(fst, pw, true, true);
    //pw.close();
    } finally {
        IOUtils.closeWhileHandlingException(reader, writer);
        IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
    }
}
Also used: OfflineSorter (org.apache.lucene.util.OfflineSorter), Builder (org.apache.lucene.util.fst.Builder), BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder), CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder), IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder), PairOutputs (org.apache.lucene.util.fst.PairOutputs), IntsRef (org.apache.lucene.util.IntsRef), BytesRef (org.apache.lucene.util.BytesRef), Directory (org.apache.lucene.store.Directory), FSDirectory (org.apache.lucene.store.FSDirectory), Pair (org.apache.lucene.util.fst.PairOutputs.Pair), HashSet (java.util.HashSet), IndexOutput (org.apache.lucene.store.IndexOutput), LimitedFiniteStringsIterator (org.apache.lucene.util.automaton.LimitedFiniteStringsIterator), ByteArrayDataInput (org.apache.lucene.store.ByteArrayDataInput), ByteArrayDataOutput (org.apache.lucene.store.ByteArrayDataOutput), TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton)
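The heart of build() is feeding sorted byte sequences into an FST Builder. Below is a minimal sketch of that step in isolation, assuming a single Long weight per entry instead of the Pair<Long, BytesRef> outputs the suggester uses; the terms and weights are made up for illustration.

// Sketch: the FST-building step on its own (assumption: PositiveIntOutputs with Long values
// instead of the suggester's Pair<Long, BytesRef> outputs; terms and weights are invented).
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

public class FstBuildSketch {
    public static void main(String[] args) throws Exception {
        PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
        Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
        IntsRefBuilder scratch = new IntsRefBuilder();
        // Inputs must arrive in sorted order, which is why build() runs OfflineSorter first.
        builder.add(Util.toIntsRef(new BytesRef("lucene"), scratch), 5L);
        builder.add(Util.toIntsRef(new BytesRef("search"), scratch), 7L);
        FST<Long> fst = builder.finish();
        // Look one of the terms up again; prints 7.
        System.out.println(Util.get(fst, new BytesRef("search")));
    }
}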

Example 53 with Directory

Use of org.apache.lucene.store.Directory in the elasticsearch project by elastic.

From the class CorruptionUtils, method corruptFile.

/**
 * Corrupts a random file at a random position.
 */
public static void corruptFile(Random random, Path... files) throws IOException {
    assertTrue("files must be non-empty", files.length > 0);
    final Path fileToCorrupt = RandomPicks.randomFrom(random, files);
    assertTrue(fileToCorrupt + " is not a file", Files.isRegularFile(fileToCorrupt));
    try (Directory dir = FSDirectory.open(fileToCorrupt.toAbsolutePath().getParent())) {
        long checksumBeforeCorruption;
        try (IndexInput input = dir.openInput(fileToCorrupt.getFileName().toString(), IOContext.DEFAULT)) {
            checksumBeforeCorruption = CodecUtil.retrieveChecksum(input);
        }
        try (FileChannel raf = FileChannel.open(fileToCorrupt, StandardOpenOption.READ, StandardOpenOption.WRITE)) {
            // read
            raf.position(random.nextInt((int) Math.min(Integer.MAX_VALUE, raf.size())));
            long filePointer = raf.position();
            ByteBuffer bb = ByteBuffer.wrap(new byte[1]);
            raf.read(bb);
            bb.flip();
            // corrupt
            byte oldValue = bb.get(0);
            byte newValue = (byte) (oldValue + 1);
            bb.put(0, newValue);
            // rewrite
            raf.position(filePointer);
            raf.write(bb);
            logger.info("Corrupting file --  flipping at position {} from {} to {} file: {}", filePointer, Integer.toHexString(oldValue), Integer.toHexString(newValue), fileToCorrupt.getFileName());
        }
        long checksumAfterCorruption;
        long actualChecksumAfterCorruption;
        try (ChecksumIndexInput input = dir.openChecksumInput(fileToCorrupt.getFileName().toString(), IOContext.DEFAULT)) {
            assertThat(input.getFilePointer(), is(0L));
            // one long is the checksum... 8 bytes
            input.seek(input.length() - 8);
            checksumAfterCorruption = input.getChecksum();
            actualChecksumAfterCorruption = input.readLong();
        }
        // assume (rather than assert) that the checksums really differ: there is a small
        // chance of a checksum collision, which is acceptable
        StringBuilder msg = new StringBuilder();
        msg.append("before: [").append(checksumBeforeCorruption).append("] ");
        msg.append("after: [").append(checksumAfterCorruption).append("] ");
        msg.append("checksum value after corruption: ").append(actualChecksumAfterCorruption).append("] ");
        msg.append("file: ").append(fileToCorrupt.getFileName()).append(" length: ").append(dir.fileLength(fileToCorrupt.getFileName().toString()));
        logger.info("Checksum {}", msg);
        assumeTrue("Checksum collision - " + msg.toString(), // collision
        checksumAfterCorruption != checksumBeforeCorruption || // checksum corrupted
        actualChecksumAfterCorruption != checksumBeforeCorruption);
        assertThat("no file corrupted", fileToCorrupt, notNullValue());
    }
}
Also used: Path (java.nio.file.Path), ChecksumIndexInput (org.apache.lucene.store.ChecksumIndexInput), FileChannel (java.nio.channels.FileChannel), IndexInput (org.apache.lucene.store.IndexInput), ByteBuffer (java.nio.ByteBuffer), Directory (org.apache.lucene.store.Directory), FSDirectory (org.apache.lucene.store.FSDirectory)
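Once a file has been damaged this way, the corruption can be detected later by re-verifying the file's footer checksum. Below is a minimal sketch of such a check; the method name, index directory, and file name are placeholders supplied by the caller, not part of CorruptionUtils.

// Sketch: detect corruption afterwards by re-verifying the footer checksum
// (assumption: indexDir and fileName are caller-supplied placeholders).
import java.io.IOException;
import java.nio.file.Path;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;

public class ChecksumVerifySketch {
    public static boolean isCorrupted(Path indexDir, String fileName) throws IOException {
        try (Directory dir = FSDirectory.open(indexDir);
             IndexInput in = dir.openInput(fileName, IOContext.READONCE)) {
            // Recomputes the checksum over the whole file and compares it to the stored footer value.
            CodecUtil.checksumEntireFile(in);
            return false;
        } catch (CorruptIndexException e) {
            return true;
        }
    }
}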

Example 54 with Directory

Use of org.apache.lucene.store.Directory in the elasticsearch project by elastic.

From the class FreqTermsEnumTests, method setUp.

@Before
@Override
public void setUp() throws Exception {
    super.setUp();
    referenceAll = new HashMap<>();
    referenceNotDeleted = new HashMap<>();
    referenceFilter = new HashMap<>();
    Directory dir = newDirectory();
    // use a keyword analyzer: we rely on the stored field holding the exact term.
    IndexWriterConfig conf = newIndexWriterConfig(new KeywordAnalyzer());
    if (frequently()) {
        // we don't want to do any merges, so we won't expunge deletes
        conf.setMergePolicy(NoMergePolicy.INSTANCE);
    }
    iw = new IndexWriter(dir, conf);
    terms = new String[scaledRandomIntBetween(10, 300)];
    for (int i = 0; i < terms.length; i++) {
        terms[i] = randomAsciiOfLength(5);
    }
    int numberOfDocs = scaledRandomIntBetween(30, 300);
    Document[] docs = new Document[numberOfDocs];
    for (int i = 0; i < numberOfDocs; i++) {
        Document doc = new Document();
        doc.add(new StringField("id", Integer.toString(i), Field.Store.YES));
        docs[i] = doc;
        for (String term : terms) {
            if (randomBoolean()) {
                continue;
            }
            int freq = randomIntBetween(1, 3);
            for (int j = 0; j < freq; j++) {
                doc.add(new TextField("field", term, Field.Store.YES));
            }
        }
    }
    for (int i = 0; i < docs.length; i++) {
        Document doc = docs[i];
        iw.addDocument(doc);
        if (rarely()) {
            iw.commit();
        }
    }
    Set<String> deletedIds = new HashSet<>();
    for (int i = 0; i < docs.length; i++) {
        Document doc = docs[i];
        if (randomInt(5) == 2) {
            Term idTerm = new Term("id", doc.getField("id").stringValue());
            deletedIds.add(idTerm.text());
            iw.deleteDocuments(idTerm);
        }
    }
    for (String term : terms) {
        referenceAll.put(term, new FreqHolder());
        referenceFilter.put(term, new FreqHolder());
        referenceNotDeleted.put(term, new FreqHolder());
    }
    // now go over each doc, build the relevant references and filter
    reader = DirectoryReader.open(iw);
    List<BytesRef> filterTerms = new ArrayList<>();
    for (int docId = 0; docId < reader.maxDoc(); docId++) {
        Document doc = reader.document(docId);
        addFreqs(doc, referenceAll);
        if (!deletedIds.contains(doc.getField("id").stringValue())) {
            addFreqs(doc, referenceNotDeleted);
            if (randomBoolean()) {
                filterTerms.add(new BytesRef(doc.getField("id").stringValue()));
                addFreqs(doc, referenceFilter);
            }
        }
    }
    filter = new TermInSetQuery("id", filterTerms);
}
Also used: KeywordAnalyzer (org.apache.lucene.analysis.core.KeywordAnalyzer), ArrayList (java.util.ArrayList), Term (org.apache.lucene.index.Term), Document (org.apache.lucene.document.Document), IndexWriter (org.apache.lucene.index.IndexWriter), TermInSetQuery (org.apache.lucene.search.TermInSetQuery), StringField (org.apache.lucene.document.StringField), TextField (org.apache.lucene.document.TextField), BytesRef (org.apache.lucene.util.BytesRef), Directory (org.apache.lucene.store.Directory), IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig), HashSet (java.util.HashSet), Before (org.junit.Before)
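The filter built at the end of setUp() is a TermInSetQuery over stored ids. Below is a minimal, self-contained sketch of that query in isolation; the documents and ids are illustrative, not the randomized ones the test generates.

// Sketch: a TermInSetQuery id filter on its own (assumption: the index contents
// and ids below are invented for illustration).
import java.util.Arrays;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;

public class TermInSetFilterSketch {
    public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(new KeywordAnalyzer()));
        for (String id : new String[] { "1", "2", "3" }) {
            Document doc = new Document();
            doc.add(new StringField("id", id, Field.Store.YES));
            iw.addDocument(doc);
        }
        DirectoryReader reader = DirectoryReader.open(iw);
        IndexSearcher searcher = new IndexSearcher(reader);
        // Matches any document whose "id" term is in the given set; prints 2.
        TermInSetQuery filter = new TermInSetQuery("id", Arrays.asList(new BytesRef("1"), new BytesRef("3")));
        System.out.println(searcher.count(filter));
        reader.close();
        iw.close();
        dir.close();
    }
}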

Example 55 with Directory

Use of org.apache.lucene.store.Directory in the elasticsearch project by elastic.

From the class MoreLikeThisQueryTests, method testSimple.

public void testSimple() throws Exception {
    Directory dir = new RAMDirectory();
    IndexWriter indexWriter = new IndexWriter(dir, new IndexWriterConfig(Lucene.STANDARD_ANALYZER));
    indexWriter.commit();
    Document document = new Document();
    document.add(new TextField("_id", "1", Field.Store.YES));
    document.add(new TextField("text", "lucene", Field.Store.YES));
    indexWriter.addDocument(document);
    document = new Document();
    document.add(new TextField("_id", "2", Field.Store.YES));
    document.add(new TextField("text", "lucene release", Field.Store.YES));
    indexWriter.addDocument(document);
    IndexReader reader = DirectoryReader.open(indexWriter);
    IndexSearcher searcher = new IndexSearcher(reader);
    MoreLikeThisQuery mltQuery = new MoreLikeThisQuery("lucene", new String[] { "text" }, Lucene.STANDARD_ANALYZER);
    mltQuery.setLikeText("lucene");
    mltQuery.setMinTermFrequency(1);
    mltQuery.setMinDocFreq(1);
    long count = searcher.count(mltQuery);
    assertThat(count, equalTo(2L));
    reader.close();
    indexWriter.close();
}
Also used: IndexSearcher (org.apache.lucene.search.IndexSearcher), IndexWriter (org.apache.lucene.index.IndexWriter), IndexReader (org.apache.lucene.index.IndexReader), MoreLikeThisQuery (org.elasticsearch.common.lucene.search.MoreLikeThisQuery), TextField (org.apache.lucene.document.TextField), Document (org.apache.lucene.document.Document), RAMDirectory (org.apache.lucene.store.RAMDirectory), Directory (org.apache.lucene.store.Directory), IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)
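The query under test here is Elasticsearch's own MoreLikeThisQuery wrapper. For comparison, below is a minimal sketch of the same two-document search using Lucene's MoreLikeThis helper from the queries module; this is an analogue chosen for illustration, not the class used in the test.

// Sketch: the same search with Lucene's MoreLikeThis helper (assumption: a plain-Lucene
// analogue of the Elasticsearch MoreLikeThisQuery used above).
import java.io.StringReader;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queries.mlt.MoreLikeThis;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class MoreLikeThisSketch {
    public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        StandardAnalyzer analyzer = new StandardAnalyzer();
        IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(analyzer));
        for (String text : new String[] { "lucene", "lucene release" }) {
            Document doc = new Document();
            doc.add(new TextField("text", text, Field.Store.YES));
            w.addDocument(doc);
        }
        IndexReader reader = DirectoryReader.open(w);
        MoreLikeThis mlt = new MoreLikeThis(reader);
        mlt.setAnalyzer(analyzer);
        mlt.setFieldNames(new String[] { "text" });
        // Lower the thresholds so terms from this tiny index are not ignored.
        mlt.setMinTermFreq(1);
        mlt.setMinDocFreq(1);
        Query like = mlt.like("text", new StringReader("lucene"));
        System.out.println(new IndexSearcher(reader).count(like)); // 2
        reader.close();
        w.close();
        dir.close();
    }
}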

Aggregations

Directory (org.apache.lucene.store.Directory): 2194
Document (org.apache.lucene.document.Document): 1374
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 816
RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter): 669
IndexReader (org.apache.lucene.index.IndexReader): 590
IndexSearcher (org.apache.lucene.search.IndexSearcher): 383
BytesRef (org.apache.lucene.util.BytesRef): 376
RAMDirectory (org.apache.lucene.store.RAMDirectory): 360
Term (org.apache.lucene.index.Term): 325
StringField (org.apache.lucene.document.StringField): 313
IndexWriter (org.apache.lucene.index.IndexWriter): 312
IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 274
TextField (org.apache.lucene.document.TextField): 259
Field (org.apache.lucene.document.Field): 257
NumericDocValuesField (org.apache.lucene.document.NumericDocValuesField): 257
Test (org.junit.Test): 237
FSDirectory (org.apache.lucene.store.FSDirectory): 221
Analyzer (org.apache.lucene.analysis.Analyzer): 193
DirectoryReader (org.apache.lucene.index.DirectoryReader): 193
TopDocs (org.apache.lucene.search.TopDocs): 174