Search in sources :

Example 21 with ByteArrayDataInput

use of org.apache.lucene.store.ByteArrayDataInput in project lucene-solr by apache.

In class TestSynonymGraphFilter, the method testPositionLengthAndType:

/**
   * verify type of token and positionLengths on synonyms of different word counts.
   */
/**
 * Verifies the token {@code type} and {@code positionLength} attributes emitted for
 * synonyms of different word counts, and additionally decodes the raw synonym entry
 * for "usa" straight out of the SynonymMap FST to check the stored word count.
 */
public void testPositionLengthAndType() throws Exception {
    String testFile = "spider man, spiderman\n" + "usa,united states,u s a,united states of america";
    Analyzer analyzer = new MockAnalyzer(random());
    SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);
    parser.parse(new StringReader(testFile));
    // The MockAnalyzer was only needed for parsing; close it before reassigning below.
    analyzer.close();
    SynonymMap map = parser.build();
    analyzer = getFlattenAnalyzer(parser, true);
    // Look up the raw FST output for the input "usa" and decode it by hand.
    BytesRef value = Util.get(map.fst, Util.toUTF32(new CharsRef("usa"), new IntsRefBuilder()));
    ByteArrayDataInput bytesReader = new ByteArrayDataInput(value.bytes, value.offset, value.length);
    // NOTE(review): low bit of the vInt code appears to be a flag; the remaining
    // bits hold the synonym count — matches how the count is recovered here.
    final int code = bytesReader.readVInt();
    final int count = code >>> 1;
    final int[] synonymsIdxs = new int[count];
    for (int i = 0; i < count; i++) {
        synonymsIdxs[i] = bytesReader.readVInt();
    }
    // Count the words in the third synonym by counting WORD_SEPARATOR bytes.
    BytesRef scratchBytes = new BytesRef();
    map.words.get(synonymsIdxs[2], scratchBytes);
    int synonymLength = 1;
    for (int i = scratchBytes.offset; i < scratchBytes.offset + scratchBytes.length; i++) {
        if (scratchBytes.bytes[i] == SynonymMap.WORD_SEPARATOR) {
            synonymLength++;
        }
    }
    // Fixed: assertEquals takes (expected, actual) — the original had them reversed,
    // which yields misleading "expected X but was Y" messages on failure.
    assertEquals(3, count);
    assertEquals(4, synonymLength);
    assertAnalyzesTo(analyzer, "spider man", new String[] { "spiderman", "spider", "man" }, new int[] { 0, 0, 7 }, new int[] { 10, 6, 10 }, new String[] { "SYNONYM", "word", "word" }, new int[] { 1, 0, 1 }, new int[] { 2, 1, 1 });
    assertAnalyzesToPositions(analyzer, "amazing spider man", new String[] { "amazing", "spiderman", "spider", "man" }, new String[] { "word", "SYNONYM", "word", "word" }, new int[] { 1, 1, 0, 1 }, new int[] { 1, 2, 1, 1 });
    // System.out.println(toDot(getAnalyzer(parser, true).tokenStream("field", new StringReader("the usa is wealthy"))));
    assertAnalyzesTo(analyzer, "the united states of america is wealthy", new String[] { "the", "usa", "united", "u", "united", "states", "s", "states", "a", "of", "america", "is", "wealthy" }, new int[] { 0, 4, 4, 4, 4, 11, 11, 11, 18, 18, 21, 29, 32 }, new int[] { 3, 28, 10, 10, 10, 28, 17, 17, 28, 20, 28, 31, 39 }, new String[] { "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "word", "SYNONYM", "word", "word", "word", "word" }, new int[] { 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1 }, new int[] { 1, 4, 1, 1, 1, 3, 1, 1, 2, 1, 1, 1, 1 });
    assertAnalyzesToPositions(analyzer, "spiderman", new String[] { "spider", "spiderman", "man" }, new String[] { "SYNONYM", "word", "SYNONYM" }, new int[] { 1, 0, 1 }, new int[] { 1, 2, 1 });
    assertAnalyzesTo(analyzer, "spiderman enemies", new String[] { "spider", "spiderman", "man", "enemies" }, new int[] { 0, 0, 0, 10 }, new int[] { 9, 9, 9, 17 }, new String[] { "SYNONYM", "word", "SYNONYM", "word" }, new int[] { 1, 0, 1, 1 }, new int[] { 1, 2, 1, 1 });
    assertAnalyzesTo(analyzer, "the usa is wealthy", new String[] { "the", "united", "u", "united", "usa", "states", "s", "states", "a", "of", "america", "is", "wealthy" }, new int[] { 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 11 }, new int[] { 3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 10, 18 }, new String[] { "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "word", "word" }, new int[] { 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1 }, new int[] { 1, 1, 1, 1, 4, 3, 1, 1, 2, 1, 1, 1, 1 });
    assertGraphStrings(analyzer, "the usa is wealthy", new String[] { "the usa is wealthy", "the united states is wealthy", "the u s a is wealthy", "the united states of america is wealthy", // Wrong. Here only due to "sausagization" of the multi word synonyms.
    "the u states is wealthy", "the u states a is wealthy", "the u s of america is wealthy", "the u states of america is wealthy", "the united s a is wealthy", "the united states a is wealthy", "the united s of america is wealthy" });
    assertAnalyzesTo(analyzer, "the united states is wealthy", new String[] { "the", "usa", "u", "united", "united", "s", "states", "states", "a", "of", "america", "is", "wealthy" }, new int[] { 0, 4, 4, 4, 4, 11, 11, 11, 11, 11, 11, 18, 21 }, new int[] { 3, 17, 10, 10, 10, 17, 17, 17, 17, 17, 17, 20, 28 }, new String[] { "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "word" }, new int[] { 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1 }, new int[] { 1, 4, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1 }, false);
    assertAnalyzesTo(analyzer, "the united states of balance", new String[] { "the", "usa", "u", "united", "united", "s", "states", "states", "a", "of", "america", "of", "balance" }, new int[] { 0, 4, 4, 4, 4, 11, 11, 11, 11, 11, 11, 18, 21 }, new int[] { 3, 17, 10, 10, 10, 17, 17, 17, 17, 17, 17, 20, 28 }, new String[] { "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "word" }, new int[] { 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1 }, new int[] { 1, 4, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1 });
    analyzer.close();
}
Also used : MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) StringReader(java.io.StringReader) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) ByteArrayDataInput(org.apache.lucene.store.ByteArrayDataInput) BytesRef(org.apache.lucene.util.BytesRef) CharsRef(org.apache.lucene.util.CharsRef)

Example 22 with ByteArrayDataInput

use of org.apache.lucene.store.ByteArrayDataInput in project lucene-solr by apache.

In class Test2BBinaryDocValues, the method testVariableBinary:

// indexes IndexWriter.MAX_DOCS docs with a variable binary field
/**
 * Indexes IndexWriter.MAX_DOCS documents with a variable-length binary doc-values
 * field (a vInt of the doc number mod 65535), then reads every value back and
 * checks it round-trips exactly.
 */
public void testVariableBinary() throws Exception {
    BaseDirectoryWrapper dir = newFSDirectory(createTempDir("2BVariableBinary"));
    if (dir instanceof MockDirectoryWrapper) {
        // Throttling would make a 2B-doc run take far too long.
        ((MockDirectoryWrapper) dir).setThrottling(MockDirectoryWrapper.Throttling.NEVER);
    }
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())).setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH).setRAMBufferSizeMB(256.0).setMergeScheduler(new ConcurrentMergeScheduler()).setMergePolicy(newLogMergePolicy(false, 10)).setOpenMode(IndexWriterConfig.OpenMode.CREATE).setCodec(TestUtil.getDefaultCodec()));
    // One reusable document: the BytesRef wraps the same backing array the
    // encoder writes into, so only the length needs updating per doc.
    Document document = new Document();
    byte[] scratch = new byte[4];
    ByteArrayDataOutput encoder = new ByteArrayDataOutput(scratch);
    BytesRef valueRef = new BytesRef(scratch);
    BinaryDocValuesField field = new BinaryDocValuesField("dv", valueRef);
    document.add(field);
    for (int docId = 0; docId < IndexWriter.MAX_DOCS; docId++) {
        encoder.reset(scratch);
        // Encoded as a vInt: 1, 2, or 3 bytes depending on magnitude.
        encoder.writeVInt(docId % 65535);
        valueRef.length = encoder.getPosition();
        writer.addDocument(document);
        if (docId % 100000 == 0) {
            System.out.println("indexed: " + docId);
            System.out.flush();
        }
    }
    writer.forceMerge(1);
    writer.close();
    System.out.println("verifying...");
    System.out.flush();
    DirectoryReader topReader = DirectoryReader.open(dir);
    int expected = 0;
    ByteArrayDataInput decoder = new ByteArrayDataInput();
    for (LeafReaderContext leafContext : topReader.leaves()) {
        LeafReader leaf = leafContext.reader();
        BinaryDocValues values = leaf.getBinaryDocValues("dv");
        for (int docId = 0; docId < leaf.maxDoc(); docId++) {
            assertEquals(docId, values.nextDoc());
            final BytesRef term = values.binaryValue();
            decoder.reset(term.bytes, term.offset, term.length);
            // Each stored value must decode to exactly the number we wrote, with no trailing bytes.
            assertEquals(expected % 65535, decoder.readVInt());
            assertTrue(decoder.eof());
            expected++;
        }
    }
    topReader.close();
    dir.close();
}
Also used : MockDirectoryWrapper(org.apache.lucene.store.MockDirectoryWrapper) Document(org.apache.lucene.document.Document) ByteArrayDataInput(org.apache.lucene.store.ByteArrayDataInput) BinaryDocValuesField(org.apache.lucene.document.BinaryDocValuesField) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) ByteArrayDataOutput(org.apache.lucene.store.ByteArrayDataOutput) BaseDirectoryWrapper(org.apache.lucene.store.BaseDirectoryWrapper) BytesRef(org.apache.lucene.util.BytesRef)

Example 23 with ByteArrayDataInput

use of org.apache.lucene.store.ByteArrayDataInput in project lucene-solr by apache.

In class SegmentTermsEnum, the method printSeekState:

/**
 * Debug-only: dumps the current seek state (the chain of frames from the root
 * down to {@code currentFrame}) to {@code out}, and cross-checks each seek frame
 * against the terms index, throwing {@link RuntimeException} if the state is
 * internally inconsistent.
 *
 * @param out destination for the human-readable dump
 * @throws IOException if reading from the terms index fails
 */
@SuppressWarnings("unused")
private void printSeekState(PrintStream out) throws IOException {
    if (currentFrame == staticFrame) {
        out.println("  no prior seek");
    } else {
        out.println("  prior seek state:");
        int ord = 0;
        // Frames up to validIndexPrefix were reached by seeking; the rest by next().
        boolean isSeekFrame = true;
        while (true) {
            SegmentTermsEnumFrame f = getFrame(ord);
            assert f != null;
            // The prefix of the current term that this frame covers.
            final BytesRef prefix = new BytesRef(term.get().bytes, 0, f.prefix);
            if (f.nextEnt == -1) {
                // Frame not yet loaded (no entries scanned).
                out.println("    frame " + (isSeekFrame ? "(seek)" : "(next)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + prefix + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp << BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0) + (f.isFloor ? BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0)) + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd());
            } else {
                out.println("    frame " + (isSeekFrame ? "(seek, loaded)" : "(next, loaded)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + prefix + " nextEnt=" + f.nextEnt + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp << BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0) + (f.isFloor ? BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0)) + " lastSubFP=" + f.lastSubFP + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd());
            }
            if (fr.index != null) {
                assert !isSeekFrame || f.arc != null : "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc;
                // A seek frame's FST arc label must match the corresponding term byte.
                if (f.prefix > 0 && isSeekFrame && f.arc.label != (term.byteAt(f.prefix - 1) & 0xFF)) {
                    out.println("      broken seek state: arc.label=" + (char) f.arc.label + " vs term byte=" + (char) (term.byteAt(f.prefix - 1) & 0xFF));
                    throw new RuntimeException("seek state is broken");
                }
                BytesRef output = Util.get(fr.index, prefix);
                if (output == null) {
                    out.println("      broken seek state: prefix is not final in index");
                    throw new RuntimeException("seek state is broken");
                } else if (isSeekFrame && !f.isFloor) {
                    // Re-derive the frame's code from fp + flags and compare with the
                    // code stored as the FST output for this prefix.
                    final ByteArrayDataInput reader = new ByteArrayDataInput(output.bytes, output.offset, output.length);
                    final long codeOrig = reader.readVLong();
                    final long code = (f.fp << BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) | (f.hasTerms ? BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0) | (f.isFloor ? BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0);
                    if (codeOrig != code) {
                        out.println("      broken seek state: output code=" + codeOrig + " doesn't match frame code=" + code);
                        throw new RuntimeException("seek state is broken");
                    }
                }
            }
            if (f == currentFrame) {
                break;
            }
            if (f.prefix == validIndexPrefix) {
                // Frames past the valid index prefix were reached via next(), not seek.
                isSeekFrame = false;
            }
            ord++;
        }
    }
}
Also used : ByteArrayDataInput(org.apache.lucene.store.ByteArrayDataInput) BytesRef(org.apache.lucene.util.BytesRef)

Example 24 with ByteArrayDataInput

use of org.apache.lucene.store.ByteArrayDataInput in project lucene-solr by apache.

In class SimpleTransLog, the method replay:

/** Replays ops between start and end location against the provided writer.  Can run concurrently with ongoing operations. */
/**
 * Replays ops between start and end location against the provided writer.
 * Can run concurrently with ongoing operations.
 *
 * <p>Each log record is a 4-byte big-endian length followed by a payload whose
 * first byte is the op code: 0 = add, 1 = update, 2 = delete.
 *
 * @param primary node to replay the operations against
 * @param start   byte offset of the first record to replay
 * @param end     byte offset just past the last record to replay
 * @throws IOException if reading the log or talking to the primary fails
 * @throws CorruptIndexException if an unknown op code is encountered
 */
public void replay(NodeProcess primary, long start, long end) throws IOException {
    try (Connection c = new Connection(primary.tcpPort)) {
        c.out.writeByte(SimplePrimaryNode.CMD_INDEXING);
        byte[] intBuffer = new byte[4];
        ByteBuffer intByteBuffer = ByteBuffer.wrap(intBuffer);
        ByteArrayDataInput in = new ByteArrayDataInput();
        long pos = start;
        while (pos < end) {
            // Read the 4-byte big-endian record length.
            intByteBuffer.position(0);
            intByteBuffer.limit(4);
            readBytesFromChannel(pos, intByteBuffer);
            pos += 4;
            int len = ((intBuffer[0] & 0xff) << 24) | (intBuffer[1] & 0xff) << 16 | (intBuffer[2] & 0xff) << 8 | (intBuffer[3] & 0xff);
            byte[] bytes = new byte[len];
            readBytesFromChannel(pos, ByteBuffer.wrap(bytes));
            pos += len;
            in.reset(bytes);
            byte op = in.readByte();
            //System.out.println("xlog: replay op=" + op);
            switch(op) {
                // Fixed duplication: cases 0 (add) and 1 (update) had identical
                // bodies; both are replayed as updates, so fall through.
                case 0:
                case 1:
                    // We replay add as update:
                    replayAddDocument(c, primary, in);
                    break;
                case 2:
                    replayDeleteDocuments(c, primary, in);
                    break;
                default:
                    throw new CorruptIndexException("invalid operation " + op, in);
            }
        }
        assert pos == end;
        //System.out.println("xlog: done replay");
        c.out.writeByte(SimplePrimaryNode.CMD_INDEXING_DONE);
        c.flush();
        //System.out.println("xlog: done flush");
        c.in.readByte();
    //System.out.println("xlog: done readByte");
    }
}
Also used : CorruptIndexException(org.apache.lucene.index.CorruptIndexException) ByteArrayDataInput(org.apache.lucene.store.ByteArrayDataInput) ByteBuffer(java.nio.ByteBuffer)

Example 25 with ByteArrayDataInput

use of org.apache.lucene.store.ByteArrayDataInput in project lucene-solr by apache.

In class AbstractTestCompressionMode, the method decompress:

/**
 * Fully decompresses {@code compressed} with the given decompressor and returns
 * exactly {@code originalLength} restored bytes as a standalone array.
 */
static byte[] decompress(Decompressor decompressor, byte[] compressed, int originalLength) throws IOException {
    final BytesRef restored = new BytesRef();
    decompressor.decompress(new ByteArrayDataInput(compressed), originalLength, 0, originalLength, restored);
    final int from = restored.offset;
    final int to = from + restored.length;
    return Arrays.copyOfRange(restored.bytes, from, to);
}
Also used : ByteArrayDataInput(org.apache.lucene.store.ByteArrayDataInput) BytesRef(org.apache.lucene.util.BytesRef)

Aggregations

ByteArrayDataInput (org.apache.lucene.store.ByteArrayDataInput)26 BytesRef (org.apache.lucene.util.BytesRef)16 ByteArrayDataOutput (org.apache.lucene.store.ByteArrayDataOutput)8 IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder)5 IndexOutput (org.apache.lucene.store.IndexOutput)4 BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)4 IOException (java.io.IOException)3 HashSet (java.util.HashSet)3 Random (java.util.Random)3 IntPoint (org.apache.lucene.document.IntPoint)3 CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder)3 IntsRef (org.apache.lucene.util.IntsRef)3 OfflineSorter (org.apache.lucene.util.OfflineSorter)3 Pair (org.apache.lucene.util.fst.PairOutputs.Pair)3 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)2 TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton)2 Directory (org.apache.lucene.store.Directory)2 IndexInput (org.apache.lucene.store.IndexInput)2 CharsRef (org.apache.lucene.util.CharsRef)2 LimitedFiniteStringsIterator (org.apache.lucene.util.automaton.LimitedFiniteStringsIterator)2