Use of org.apache.lucene.store.ByteArrayDataInput in project lucene-solr by apache.
The class TestSynonymGraphFilter, method testPositionLengthAndType.
/**
 * Verify the token type and positionLength on synonyms of different word counts.
 */
public void testPositionLengthAndType() throws Exception {
  String testFile = "spider man, spiderman\n" +
                    "usa,united states,u s a,united states of america";
  Analyzer analyzer = new MockAnalyzer(random());
  SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);
  parser.parse(new StringReader(testFile));
  analyzer.close();
  SynonymMap map = parser.build();
  analyzer = getFlattenAnalyzer(parser, true);
  // Decode the FST output for "usa": a VInt header whose low bit flags keepOrig,
  // followed by one VInt ord per synonym entry:
  BytesRef value = Util.get(map.fst, Util.toUTF32(new CharsRef("usa"), new IntsRefBuilder()));
  ByteArrayDataInput bytesReader = new ByteArrayDataInput(value.bytes, value.offset, value.length);
  final int code = bytesReader.readVInt();
  final int count = code >>> 1;
  final int[] synonymsIdxs = new int[count];
  for (int i = 0; i < count; i++) {
    synonymsIdxs[i] = bytesReader.readVInt();
  }
  BytesRef scratchBytes = new BytesRef();
  map.words.get(synonymsIdxs[2], scratchBytes);
  // Count the words of synonym #2 (the four-word entry) via WORD_SEPARATOR bytes:
  int synonymLength = 1;
  for (int i = scratchBytes.offset; i < scratchBytes.offset + scratchBytes.length; i++) {
    if (scratchBytes.bytes[i] == SynonymMap.WORD_SEPARATOR) {
      synonymLength++;
    }
  }
  assertEquals(3, count);
  assertEquals(4, synonymLength);
  // assertAnalyzesTo args: output tokens, start offsets, end offsets, types,
  // position increments, position lengths.
  assertAnalyzesTo(analyzer, "spider man",
      new String[] { "spiderman", "spider", "man" },
      new int[] { 0, 0, 7 },
      new int[] { 10, 6, 10 },
      new String[] { "SYNONYM", "word", "word" },
      new int[] { 1, 0, 1 },
      new int[] { 2, 1, 1 });
  assertAnalyzesToPositions(analyzer, "amazing spider man",
      new String[] { "amazing", "spiderman", "spider", "man" },
      new String[] { "word", "SYNONYM", "word", "word" },
      new int[] { 1, 1, 0, 1 },
      new int[] { 1, 2, 1, 1 });
  // System.out.println(toDot(getAnalyzer(parser, true).tokenStream("field", new StringReader("the usa is wealthy"))));
  assertAnalyzesTo(analyzer, "the united states of america is wealthy",
      new String[] { "the", "usa", "united", "u", "united", "states", "s", "states", "a", "of", "america", "is", "wealthy" },
      new int[] { 0, 4, 4, 4, 4, 11, 11, 11, 18, 18, 21, 29, 32 },
      new int[] { 3, 28, 10, 10, 10, 28, 17, 17, 28, 20, 28, 31, 39 },
      new String[] { "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "word", "SYNONYM", "word", "word", "word", "word" },
      new int[] { 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1 },
      new int[] { 1, 4, 1, 1, 1, 3, 1, 1, 2, 1, 1, 1, 1 });
  assertAnalyzesToPositions(analyzer, "spiderman",
      new String[] { "spider", "spiderman", "man" },
      new String[] { "SYNONYM", "word", "SYNONYM" },
      new int[] { 1, 0, 1 },
      new int[] { 1, 2, 1 });
  assertAnalyzesTo(analyzer, "spiderman enemies",
      new String[] { "spider", "spiderman", "man", "enemies" },
      new int[] { 0, 0, 0, 10 },
      new int[] { 9, 9, 9, 17 },
      new String[] { "SYNONYM", "word", "SYNONYM", "word" },
      new int[] { 1, 0, 1, 1 },
      new int[] { 1, 2, 1, 1 });
  assertAnalyzesTo(analyzer, "the usa is wealthy",
      new String[] { "the", "united", "u", "united", "usa", "states", "s", "states", "a", "of", "america", "is", "wealthy" },
      new int[] { 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 11 },
      new int[] { 3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 10, 18 },
      new String[] { "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "word", "word" },
      new int[] { 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1 },
      new int[] { 1, 1, 1, 1, 4, 3, 1, 1, 2, 1, 1, 1, 1 });
  assertGraphStrings(analyzer, "the usa is wealthy", new String[] {
      "the usa is wealthy",
      "the united states is wealthy",
      "the u s a is wealthy",
      "the united states of america is wealthy",
      // Wrong. Here only due to "sausagization" of the multi-word synonyms.
      "the u states is wealthy",
      "the u states a is wealthy",
      "the u s of america is wealthy",
      "the u states of america is wealthy",
      "the united s a is wealthy",
      "the united states a is wealthy",
      "the united s of america is wealthy" });
  assertAnalyzesTo(analyzer, "the united states is wealthy",
      new String[] { "the", "usa", "u", "united", "united", "s", "states", "states", "a", "of", "america", "is", "wealthy" },
      new int[] { 0, 4, 4, 4, 4, 11, 11, 11, 11, 11, 11, 18, 21 },
      new int[] { 3, 17, 10, 10, 10, 17, 17, 17, 17, 17, 17, 20, 28 },
      new String[] { "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "word" },
      new int[] { 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1 },
      new int[] { 1, 4, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1 },
      false);
  assertAnalyzesTo(analyzer, "the united states of balance",
      new String[] { "the", "usa", "u", "united", "united", "s", "states", "states", "a", "of", "america", "of", "balance" },
      new int[] { 0, 4, 4, 4, 4, 11, 11, 11, 11, 11, 11, 18, 21 },
      new int[] { 3, 17, 10, 10, 10, 17, 17, 17, 17, 17, 17, 20, 28 },
      new String[] { "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "word" },
      new int[] { 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1 },
      new int[] { 1, 4, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1 });
  analyzer.close();
}
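The decode in this test mirrors how SynonymFilter itself unpacks an FST output. A self-contained sketch of the same byte layout, with the keepOrig bit included; decodeSynonymOutput is a hypothetical helper name, not a Lucene API:

import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.BytesRef;

static void decodeSynonymOutput(BytesRef fstOutput) {
  ByteArrayDataInput in = new ByteArrayDataInput(fstOutput.bytes, fstOutput.offset, fstOutput.length);
  int code = in.readVInt();
  boolean keepOrig = (code & 0x1) == 0; // low bit: is the original token kept?
  int count = code >>> 1;               // remaining bits: number of synonym entries
  for (int i = 0; i < count; i++) {
    int wordOrd = in.readVInt();        // ord into SynonymMap.words (a BytesRefHash)
    System.out.println("keepOrig=" + keepOrig + " synonym ord=" + wordOrd);
  }
}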
Use of org.apache.lucene.store.ByteArrayDataInput in project lucene-solr by apache.
The class Test2BBinaryDocValues, method testVariableBinary.
// indexes IndexWriter.MAX_DOCS docs with a variable binary field
public void testVariableBinary() throws Exception {
  BaseDirectoryWrapper dir = newFSDirectory(createTempDir("2BVariableBinary"));
  if (dir instanceof MockDirectoryWrapper) {
    ((MockDirectoryWrapper) dir).setThrottling(MockDirectoryWrapper.Throttling.NEVER);
  }
  IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random()))
      .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
      .setRAMBufferSizeMB(256.0)
      .setMergeScheduler(new ConcurrentMergeScheduler())
      .setMergePolicy(newLogMergePolicy(false, 10))
      .setOpenMode(IndexWriterConfig.OpenMode.CREATE)
      .setCodec(TestUtil.getDefaultCodec()));
  Document doc = new Document();
  byte[] bytes = new byte[4];
  ByteArrayDataOutput encoder = new ByteArrayDataOutput(bytes);
  BytesRef data = new BytesRef(bytes);
  BinaryDocValuesField dvField = new BinaryDocValuesField("dv", data);
  doc.add(dvField);
  for (int i = 0; i < IndexWriter.MAX_DOCS; i++) {
    // doc and dvField are reused; `data` wraps `bytes`, so rewriting the
    // buffer in place updates the field's value for the next addDocument:
    encoder.reset(bytes);
    // writes 1, 2, or 3 bytes, depending on i
    encoder.writeVInt(i % 65535);
    data.length = encoder.getPosition();
    w.addDocument(doc);
    if (i % 100000 == 0) {
      System.out.println("indexed: " + i);
      System.out.flush();
    }
  }
  w.forceMerge(1);
  w.close();
  System.out.println("verifying...");
  System.out.flush();
  DirectoryReader r = DirectoryReader.open(dir);
  int expectedValue = 0;
  ByteArrayDataInput input = new ByteArrayDataInput();
  for (LeafReaderContext context : r.leaves()) {
    LeafReader reader = context.reader();
    BinaryDocValues dv = reader.getBinaryDocValues("dv");
    for (int i = 0; i < reader.maxDoc(); i++) {
      assertEquals(i, dv.nextDoc());
      final BytesRef term = dv.binaryValue();
      input.reset(term.bytes, term.offset, term.length);
      assertEquals(expectedValue % 65535, input.readVInt());
      assertTrue(input.eof());
      expectedValue++;
    }
  }
  r.close();
  dir.close();
}
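The core pattern here, ByteArrayDataOutput writing a VInt into a reused buffer and ByteArrayDataInput reading it back, also works standalone. A minimal sketch; the roundTrip method name is ours, not Lucene's:

import java.io.IOException;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;

static void roundTrip() throws IOException {
  byte[] buffer = new byte[5]; // a VInt occupies at most 5 bytes
  ByteArrayDataOutput out = new ByteArrayDataOutput(buffer);
  out.writeVInt(65534); // largest value the test writes; encodes to 3 bytes
  ByteArrayDataInput in = new ByteArrayDataInput();
  in.reset(buffer, 0, out.getPosition()); // bound the view to the bytes actually written
  assert in.readVInt() == 65534;
  assert in.eof();
}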
Use of org.apache.lucene.store.ByteArrayDataInput in project lucene-solr by apache.
The class SegmentTermsEnum, method printSeekState.
@SuppressWarnings("unused")
private void printSeekState(PrintStream out) throws IOException {
  if (currentFrame == staticFrame) {
    out.println(" no prior seek");
  } else {
    out.println(" prior seek state:");
    int ord = 0;
    boolean isSeekFrame = true;
    while (true) {
      SegmentTermsEnumFrame f = getFrame(ord);
      assert f != null;
      final BytesRef prefix = new BytesRef(term.get().bytes, 0, f.prefix);
      if (f.nextEnt == -1) {
        out.println(" frame " + (isSeekFrame ? "(seek)" : "(next)")
            + " ord=" + ord
            + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "")
            + " prefixLen=" + f.prefix
            + " prefix=" + prefix
            + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")"))
            + " hasTerms=" + f.hasTerms
            + " isFloor=" + f.isFloor
            + " code=" + ((f.fp << BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS)
                + (f.hasTerms ? BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0)
                + (f.isFloor ? BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0))
            + " isLastInFloor=" + f.isLastInFloor
            + " mdUpto=" + f.metaDataUpto
            + " tbOrd=" + f.getTermBlockOrd());
      } else {
        out.println(" frame " + (isSeekFrame ? "(seek, loaded)" : "(next, loaded)")
            + " ord=" + ord
            + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "")
            + " prefixLen=" + f.prefix
            + " prefix=" + prefix
            + " nextEnt=" + f.nextEnt
            + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")"))
            + " hasTerms=" + f.hasTerms
            + " isFloor=" + f.isFloor
            + " code=" + ((f.fp << BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS)
                + (f.hasTerms ? BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0)
                + (f.isFloor ? BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0))
            + " lastSubFP=" + f.lastSubFP
            + " isLastInFloor=" + f.isLastInFloor
            + " mdUpto=" + f.metaDataUpto
            + " tbOrd=" + f.getTermBlockOrd());
      }
      if (fr.index != null) {
        assert !isSeekFrame || f.arc != null : "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc;
        if (f.prefix > 0 && isSeekFrame && f.arc.label != (term.byteAt(f.prefix - 1) & 0xFF)) {
          out.println(" broken seek state: arc.label=" + (char) f.arc.label
              + " vs term byte=" + (char) (term.byteAt(f.prefix - 1) & 0xFF));
          throw new RuntimeException("seek state is broken");
        }
        BytesRef output = Util.get(fr.index, prefix);
        if (output == null) {
          out.println(" broken seek state: prefix is not final in index");
          throw new RuntimeException("seek state is broken");
        } else if (isSeekFrame && !f.isFloor) {
          final ByteArrayDataInput reader = new ByteArrayDataInput(output.bytes, output.offset, output.length);
          final long codeOrig = reader.readVLong();
          final long code = (f.fp << BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS)
              | (f.hasTerms ? BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0)
              | (f.isFloor ? BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0);
          if (codeOrig != code) {
            out.println(" broken seek state: output code=" + codeOrig
                + " doesn't match frame code=" + code);
            throw new RuntimeException("seek state is broken");
          }
        }
      }
      if (f == currentFrame) {
        break;
      }
      if (f.prefix == validIndexPrefix) {
        isSeekFrame = false;
      }
      ord++;
    }
  }
}
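The `code` check above reverses the packing the block-tree terms index stores per prefix: the frame's file pointer shifted left by two bits, plus a has-terms flag and a floor flag. A standalone sketch of the unpacking; decodeFrameCode is a hypothetical name, and the constant values mirror the package-private OUTPUT_FLAGS_NUM_BITS / OUTPUT_FLAG_* fields of BlockTreeTermsReader used in the snippet:

import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.BytesRef;

// Values mirror the package-private constants in BlockTreeTermsReader:
static final int OUTPUT_FLAGS_NUM_BITS = 2;
static final long OUTPUT_FLAG_HAS_TERMS = 0x2;
static final long OUTPUT_FLAG_IS_FLOOR = 0x1;

static void decodeFrameCode(BytesRef output) {
  ByteArrayDataInput reader = new ByteArrayDataInput(output.bytes, output.offset, output.length);
  long code = reader.readVLong();
  long fp = code >>> OUTPUT_FLAGS_NUM_BITS;               // on-disk file pointer of the block
  boolean hasTerms = (code & OUTPUT_FLAG_HAS_TERMS) != 0; // block contains terms, not only sub-blocks
  boolean isFloor = (code & OUTPUT_FLAG_IS_FLOOR) != 0;   // block is split into floor blocks
  System.out.println("fp=" + fp + " hasTerms=" + hasTerms + " isFloor=" + isFloor);
}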
Use of org.apache.lucene.store.ByteArrayDataInput in project lucene-solr by apache.
The class SimpleTransLog, method replay.
/** Replays ops between the start and end locations against the provided primary node. Can run concurrently with ongoing operations. */
public void replay(NodeProcess primary, long start, long end) throws IOException {
  try (Connection c = new Connection(primary.tcpPort)) {
    c.out.writeByte(SimplePrimaryNode.CMD_INDEXING);
    byte[] intBuffer = new byte[4];
    ByteBuffer intByteBuffer = ByteBuffer.wrap(intBuffer);
    ByteArrayDataInput in = new ByteArrayDataInput();
    long pos = start;
    while (pos < end) {
      // Each record is framed as a 4-byte big-endian length followed by the payload:
      intByteBuffer.position(0);
      intByteBuffer.limit(4);
      readBytesFromChannel(pos, intByteBuffer);
      pos += 4;
      int len = ((intBuffer[0] & 0xff) << 24) | ((intBuffer[1] & 0xff) << 16)
          | ((intBuffer[2] & 0xff) << 8) | (intBuffer[3] & 0xff);
      byte[] bytes = new byte[len];
      readBytesFromChannel(pos, ByteBuffer.wrap(bytes));
      pos += len;
      in.reset(bytes);
      byte op = in.readByte();
      //System.out.println("xlog: replay op=" + op);
      switch (op) {
        case 0:
          // We replay add as update:
          replayAddDocument(c, primary, in);
          break;
        case 1:
          // Updates take the same path:
          replayAddDocument(c, primary, in);
          break;
        case 2:
          replayDeleteDocuments(c, primary, in);
          break;
        default:
          throw new CorruptIndexException("invalid operation " + op, in);
      }
    }
    assert pos == end;
    //System.out.println("xlog: done replay");
    c.out.writeByte(SimplePrimaryNode.CMD_INDEXING_DONE);
    c.flush();
    //System.out.println("xlog: done flush");
    c.in.readByte();
    //System.out.println("xlog: done readByte");
  }
}
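Each translog record that replay() walks is thus a 4-byte big-endian length prefix followed by len payload bytes, which ByteArrayDataInput then decodes. A sketch of the same framing over an in-memory buffer; walkRecords and buf are hypothetical, standing in for the file channel:

import org.apache.lucene.store.ByteArrayDataInput;

static void walkRecords(byte[] buf) {
  ByteArrayDataInput in = new ByteArrayDataInput();
  int pos = 0;
  while (pos < buf.length) {
    // 4-byte big-endian length prefix:
    int len = ((buf[pos] & 0xff) << 24) | ((buf[pos + 1] & 0xff) << 16)
        | ((buf[pos + 2] & 0xff) << 8) | (buf[pos + 3] & 0xff);
    pos += 4;
    in.reset(buf, pos, len); // view over just this record's payload
    byte op = in.readByte(); // 0 = add, 1 = update, 2 = delete documents
    System.out.println("op=" + op + " payload=" + len + " bytes");
    pos += len;
  }
}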
Use of org.apache.lucene.store.ByteArrayDataInput in project lucene-solr by apache.
The class AbstractTestCompressionMode, method decompress.
static byte[] decompress(Decompressor decompressor, byte[] compressed, int originalLength) throws IOException {
  final BytesRef bytes = new BytesRef();
  decompressor.decompress(new ByteArrayDataInput(compressed), originalLength, 0, originalLength, bytes);
  return Arrays.copyOfRange(bytes.bytes, bytes.offset, bytes.offset + bytes.length);
}
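Since Decompressor.decompress takes an offset and length into the original document (the helper above passes 0 and originalLength to recover everything), the same pattern generalizes to partial decompression, which is what the stored-fields reader does. A hedged variant; decompressSlice is our name, not part of the test class:

static byte[] decompressSlice(Decompressor decompressor, byte[] compressed,
    int originalLength, int offset, int length) throws IOException {
  final BytesRef bytes = new BytesRef();
  // Recover only the window [offset, offset + length) of the original document:
  decompressor.decompress(new ByteArrayDataInput(compressed), originalLength, offset, length, bytes);
  return Arrays.copyOfRange(bytes.bytes, bytes.offset, bytes.offset + bytes.length);
}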