Use of org.apache.lucene.document.BinaryDocValuesField in project lucene-solr by apache.
In class TestFieldCacheVsDocValues, method testHugeBinaryValueLimit:
// TODO: get this out of here and into the deprecated codecs (4.0, 4.2)
public void testHugeBinaryValueLimit() throws Exception {
  // We only test DVFormats that have a limit:
  assumeFalse("test requires codec with limits on max binary field length", codecAcceptsHugeBinaryValues("field"));
  Analyzer analyzer = new MockAnalyzer(random());
  // FSDirectory because SimpleText will consume gobs of
  // space when storing big binary values:
  Directory d = newFSDirectory(createTempDir("hugeBinaryValues"));
  boolean doFixed = random().nextBoolean();
  int numDocs;
  int fixedLength = 0;
  if (doFixed) {
    // Sometimes make all values fixed length, since some
    // codecs have different code paths for this:
    numDocs = TestUtil.nextInt(random(), 10, 20);
    fixedLength = LARGE_BINARY_FIELD_LENGTH;
  } else {
    numDocs = TestUtil.nextInt(random(), 100, 200);
  }
  IndexWriter w = new IndexWriter(d, newIndexWriterConfig(analyzer));
  List<byte[]> docBytes = new ArrayList<>();
  long totalBytes = 0;
  for (int docID = 0; docID < numDocs; docID++) {
    // We don't use RandomIndexWriter because it might add
    // more doc values than we expect.
    // Values must be > 64KB to ensure more than 2 pages in
    // PagedBytes are needed:
    int numBytes;
    if (doFixed) {
      numBytes = fixedLength;
    } else if (docID == 0 || random().nextInt(5) == 3) {
      numBytes = LARGE_BINARY_FIELD_LENGTH;
    } else {
      numBytes = TestUtil.nextInt(random(), 1, LARGE_BINARY_FIELD_LENGTH);
    }
    totalBytes += numBytes;
    if (totalBytes > 5 * 1024 * 1024) {
      break;
    }
    byte[] bytes = new byte[numBytes];
    random().nextBytes(bytes);
    docBytes.add(bytes);
    Document doc = new Document();
    BytesRef b = new BytesRef(bytes);
    b.length = bytes.length;
    doc.add(new BinaryDocValuesField("field", b));
    doc.add(new StringField("id", "" + docID, Field.Store.YES));
    w.addDocument(doc);
  }
  DirectoryReader r = DirectoryReader.open(w);
  w.close();
  LeafReader ar = SlowCompositeReaderWrapper.wrap(r);
  TestUtil.checkReader(ar);
  BinaryDocValues s = FieldCache.DEFAULT.getTerms(ar, "field");
  for (int docID = 0; docID < docBytes.size(); docID++) {
    assertEquals(docID, s.nextDoc());
    Document doc = ar.document(docID);
    BytesRef bytes = s.binaryValue();
    byte[] expected = docBytes.get(Integer.parseInt(doc.get("id")));
    assertEquals(expected.length, bytes.length);
    assertEquals(new BytesRef(expected), bytes);
  }
  ar.close();
  d.close();
}
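For readers new to this field type, here is a minimal round trip with BinaryDocValuesField outside the test framework: index one binary value, then read it back through the iterator-style doc-values API (Lucene 6.2+/7.x era, matching the snippets above). This is an illustrative sketch; the field name "payload" and the in-memory RAMDirectory are my choices, not part of the tests.

import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;

public class BinaryDocValuesRoundTrip {
  public static void main(String[] args) throws Exception {
    try (Directory dir = new RAMDirectory()) {
      try (IndexWriter w = new IndexWriter(dir, new IndexWriterConfig())) {
        Document doc = new Document();
        // One opaque binary value per document, stored column-stride:
        doc.add(new BinaryDocValuesField("payload", new BytesRef(new byte[] { 1, 2, 3 })));
        w.addDocument(doc);
      }
      try (DirectoryReader r = DirectoryReader.open(dir)) {
        LeafReader leaf = r.leaves().get(0).reader();
        BinaryDocValues dv = leaf.getBinaryDocValues("payload");
        // Iterator-style API: advance to the document, then read its value:
        if (dv.nextDoc() == 0) {
          BytesRef value = dv.binaryValue();
          System.out.println("value length = " + value.length); // prints 3
        }
      }
    }
  }
}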
Use of org.apache.lucene.document.BinaryDocValuesField in project lucene-solr by apache.
In class TestFieldCacheVsDocValues, method testHugeBinaryValues:
// LUCENE-4853
public void testHugeBinaryValues() throws Exception {
  Analyzer analyzer = new MockAnalyzer(random());
  // FSDirectory because SimpleText will consume gobs of
  // space when storing big binary values:
  Directory d = newFSDirectory(createTempDir("hugeBinaryValues"));
  boolean doFixed = random().nextBoolean();
  int numDocs;
  int fixedLength = 0;
  if (doFixed) {
    // Sometimes make all values fixed length, since some
    // codecs have different code paths for this:
    numDocs = TestUtil.nextInt(random(), 10, 20);
    fixedLength = TestUtil.nextInt(random(), 65537, 256 * 1024);
  } else {
    numDocs = TestUtil.nextInt(random(), 100, 200);
  }
  IndexWriter w = new IndexWriter(d, newIndexWriterConfig(analyzer));
  List<byte[]> docBytes = new ArrayList<>();
  long totalBytes = 0;
  for (int docID = 0; docID < numDocs; docID++) {
    // We don't use RandomIndexWriter because it might add
    // more doc values than we expect.
    // Values must be > 64KB to ensure more than 2 pages in
    // PagedBytes are needed:
    int numBytes;
    if (doFixed) {
      numBytes = fixedLength;
    } else if (docID == 0 || random().nextInt(5) == 3) {
      numBytes = TestUtil.nextInt(random(), 65537, 3 * 1024 * 1024);
    } else {
      numBytes = TestUtil.nextInt(random(), 1, 1024 * 1024);
    }
    totalBytes += numBytes;
    if (totalBytes > 5 * 1024 * 1024) {
      break;
    }
    byte[] bytes = new byte[numBytes];
    random().nextBytes(bytes);
    docBytes.add(bytes);
    Document doc = new Document();
    BytesRef b = new BytesRef(bytes);
    b.length = bytes.length;
    doc.add(new BinaryDocValuesField("field", b));
    doc.add(new StringField("id", "" + docID, Field.Store.YES));
    try {
      w.addDocument(doc);
    } catch (IllegalArgumentException iae) {
      if (iae.getMessage().indexOf("is too large") == -1) {
        throw iae;
      } else {
        // OK: some codecs can't handle binary DV > 32K
        assertFalse(codecAcceptsHugeBinaryValues("field"));
        w.rollback();
        d.close();
        return;
      }
    }
  }
  DirectoryReader r;
  try {
    r = DirectoryReader.open(w);
  } catch (IllegalArgumentException iae) {
    if (iae.getMessage().indexOf("is too large") == -1) {
      throw iae;
    } else {
      // OK: some codecs can't handle binary DV > 32K
      assertFalse(codecAcceptsHugeBinaryValues("field"));
      w.rollback();
      d.close();
      return;
    }
  }
  w.close();
  LeafReader ar = SlowCompositeReaderWrapper.wrap(r);
  TestUtil.checkReader(ar);
  BinaryDocValues s = FieldCache.DEFAULT.getTerms(ar, "field");
  for (int docID = 0; docID < docBytes.size(); docID++) {
    Document doc = ar.document(docID);
    assertEquals(docID, s.nextDoc());
    BytesRef bytes = s.binaryValue();
    byte[] expected = docBytes.get(Integer.parseInt(doc.get("id")));
    assertEquals(expected.length, bytes.length);
    assertEquals(new BytesRef(expected), bytes);
  }
  assertTrue(codecAcceptsHugeBinaryValues("field"));
  ar.close();
  d.close();
}
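The try/catch pattern above, catching IllegalArgumentException and checking the message for "is too large", is the defensive idiom for codecs that cap the length of binary doc values. A minimal sketch of the same idiom outside the test framework follows; the class and method names here are hypothetical helpers of mine, not Lucene APIs.

import java.io.IOException;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.util.BytesRef;

final class SafeDocValuesAdd {
  /**
   * Tries to add a document carrying a large BinaryDocValuesField.
   * Returns false instead of failing when the codec rejects the
   * value as too large (hypothetical helper, not a Lucene API).
   */
  static boolean tryAddLargeBinary(IndexWriter w, String field, byte[] value) throws IOException {
    Document doc = new Document();
    doc.add(new BinaryDocValuesField(field, new BytesRef(value)));
    try {
      w.addDocument(doc);
      return true;
    } catch (IllegalArgumentException iae) {
      // Same check as the test: only swallow the codec's length-limit error.
      if (iae.getMessage() != null && iae.getMessage().contains("is too large")) {
        return false;
      }
      throw iae;
    }
  }
}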
Use of org.apache.lucene.document.BinaryDocValuesField in project lucene-solr by apache.
In class FacetsConfig, method processFacetFields:
private void processFacetFields(TaxonomyWriter taxoWriter, Map<String, List<FacetField>> byField, Document doc) throws IOException {
  for (Map.Entry<String, List<FacetField>> ent : byField.entrySet()) {
    String indexFieldName = ent.getKey();
    //System.out.println("  indexFieldName=" + indexFieldName + " fields=" + ent.getValue());
    IntsRefBuilder ordinals = new IntsRefBuilder();
    for (FacetField facetField : ent.getValue()) {
      FacetsConfig.DimConfig ft = getDimConfig(facetField.dim);
      if (facetField.path.length > 1 && ft.hierarchical == false) {
        throw new IllegalArgumentException("dimension \"" + facetField.dim + "\" is not hierarchical yet has " + facetField.path.length + " components");
      }
      FacetLabel cp = new FacetLabel(facetField.dim, facetField.path);
      checkTaxoWriter(taxoWriter);
      int ordinal = taxoWriter.addCategory(cp);
      ordinals.append(ordinal);
      if (ft.multiValued && (ft.hierarchical || ft.requireDimCount)) {
        //System.out.println("  add parents");
        // Add all parents too:
        int parent = taxoWriter.getParent(ordinal);
        while (parent > 0) {
          ordinals.append(parent);
          parent = taxoWriter.getParent(parent);
        }
        if (ft.requireDimCount == false) {
          // Remove last (dimension) ord:
          ordinals.setLength(ordinals.length() - 1);
        }
      }
      // Drill down:
      for (int i = 1; i <= cp.length; i++) {
        doc.add(new StringField(indexFieldName, pathToString(cp.components, i), Field.Store.NO));
      }
    }
    // Facet counts:
    // DocValues are considered stored fields:
    doc.add(new BinaryDocValuesField(indexFieldName, dedupAndEncode(ordinals.get())));
  }
}
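dedupAndEncode is defined elsewhere in FacetsConfig: it sorts the collected ordinals, drops duplicates, and delta-encodes the survivors as variable-length ints before they are stored in the BinaryDocValuesField. The following is a sketch of that encoding scheme as I understand it, an illustration of the idea rather than a copy of the actual Lucene method:

import java.util.Arrays;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;

final class OrdinalEncoding {
  /** Sorts, dedups, and delta-vInt-encodes ordinals (illustrative sketch). */
  static BytesRef dedupAndEncode(IntsRef ordinals) {
    Arrays.sort(ordinals.ints, ordinals.offset, ordinals.offset + ordinals.length);
    byte[] bytes = new byte[5 * ordinals.length]; // worst case: 5 bytes per vInt
    int lastOrd = -1;
    int upto = 0;
    for (int i = 0; i < ordinals.length; i++) {
      int ord = ordinals.ints[ordinals.offset + i];
      if (ord > lastOrd) { // skip duplicates
        int delta = (lastOrd == -1) ? ord : ord - lastOrd;
        // Standard vInt: 7 payload bits per byte, high bit marks continuation:
        while ((delta & ~0x7F) != 0) {
          bytes[upto++] = (byte) ((delta & 0x7F) | 0x80);
          delta >>>= 7;
        }
        bytes[upto++] = (byte) delta;
        lastOrd = ord;
      }
    }
    return new BytesRef(bytes, 0, upto);
  }
}

Sorting first makes the deltas small and non-negative, which is what lets the vInt step pack most ordinals into one or two bytes.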
Use of org.apache.lucene.document.BinaryDocValuesField in project lucene-solr by apache.
In class FacetsConfig, method processAssocFacetFields:
private void processAssocFacetFields(TaxonomyWriter taxoWriter, Map<String, List<AssociationFacetField>> byField, Document doc) throws IOException {
  for (Map.Entry<String, List<AssociationFacetField>> ent : byField.entrySet()) {
    byte[] bytes = new byte[16];
    int upto = 0;
    String indexFieldName = ent.getKey();
    for (AssociationFacetField field : ent.getValue()) {
      // NOTE: we don't add parents for associations
      checkTaxoWriter(taxoWriter);
      FacetLabel label = new FacetLabel(field.dim, field.path);
      int ordinal = taxoWriter.addCategory(label);
      if (upto + 4 > bytes.length) {
        bytes = ArrayUtil.grow(bytes, upto + 4);
      }
      // big-endian:
      bytes[upto++] = (byte) (ordinal >> 24);
      bytes[upto++] = (byte) (ordinal >> 16);
      bytes[upto++] = (byte) (ordinal >> 8);
      bytes[upto++] = (byte) ordinal;
      if (upto + field.assoc.length > bytes.length) {
        bytes = ArrayUtil.grow(bytes, upto + field.assoc.length);
      }
      System.arraycopy(field.assoc.bytes, field.assoc.offset, bytes, upto, field.assoc.length);
      upto += field.assoc.length;
      // Drill down:
      for (int i = 1; i <= label.length; i++) {
        doc.add(new StringField(indexFieldName, pathToString(label.components, i), Field.Store.NO));
      }
    }
    doc.add(new BinaryDocValuesField(indexFieldName, new BytesRef(bytes, 0, upto)));
  }
}
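The big-endian packing above interleaves each 4-byte ordinal with its raw association payload. For the common case where every association is itself a 4-byte int, a matching decode loop looks like the sketch below; the class and method names are mine, not a Lucene API, and it assumes fixed-width int associations.

import org.apache.lucene.util.BytesRef;

final class AssocDecoder {
  /** Decodes (ordinal, 4-byte int association) pairs packed as above (illustrative helper). */
  static void decodeIntAssociations(BytesRef b) {
    int pos = b.offset;
    int end = b.offset + b.length;
    while (pos < end) {
      // Reassemble the big-endian ordinal written byte by byte:
      int ordinal = ((b.bytes[pos] & 0xFF) << 24)
          | ((b.bytes[pos + 1] & 0xFF) << 16)
          | ((b.bytes[pos + 2] & 0xFF) << 8)
          | (b.bytes[pos + 3] & 0xFF);
      pos += 4;
      // The association payload, here assumed to be a big-endian int:
      int value = ((b.bytes[pos] & 0xFF) << 24)
          | ((b.bytes[pos + 1] & 0xFF) << 16)
          | ((b.bytes[pos + 2] & 0xFF) << 8)
          | (b.bytes[pos + 3] & 0xFF);
      pos += 4;
      System.out.println("ord=" + ordinal + " value=" + value);
    }
  }
}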
Use of org.apache.lucene.document.BinaryDocValuesField in project lucene-solr by apache.
In class TestBinaryDocValuesUpdates, method testUpdateDocumentByMultipleTerms:
public void testUpdateDocumentByMultipleTerms() throws Exception {
  // Make sure the order of updates is respected, even when multiple terms affect the same document.
  Directory dir = newDirectory();
  IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
  IndexWriter writer = new IndexWriter(dir, conf);
  Document doc = new Document();
  doc.add(new StringField("k1", "v1", Store.NO));
  doc.add(new StringField("k2", "v2", Store.NO));
  doc.add(new BinaryDocValuesField("bdv", toBytes(5L)));
  // flushed document
  writer.addDocument(doc);
  writer.commit();
  // in-memory document
  writer.addDocument(doc);
  writer.updateBinaryDocValue(new Term("k1", "v1"), "bdv", toBytes(17L));
  writer.updateBinaryDocValue(new Term("k2", "v2"), "bdv", toBytes(3L));
  writer.close();
  final DirectoryReader reader = DirectoryReader.open(dir);
  BinaryDocValues bdv = MultiDocValues.getBinaryValues(reader, "bdv");
  for (int i = 0; i < reader.maxDoc(); i++) {
    assertEquals(i, bdv.nextDoc());
    // The later update (3) wins over the earlier one (17) for both documents:
    assertEquals(3, getValue(bdv));
  }
  reader.close();
  dir.close();
}
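toBytes and getValue are private helpers of TestBinaryDocValuesUpdates that map a long to a BytesRef and back. A self-contained stand-in is sketched below under the simplifying assumption of a fixed 8-byte big-endian encoding; the real helpers may use a different (for example variable-length) encoding, but any lossless long-to-bytes mapping makes the test's assertions work the same way.

import java.io.IOException;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.util.BytesRef;

final class LongBytes {
  /** Encodes a long as 8 big-endian bytes (simplified stand-in for the test's toBytes). */
  static BytesRef toBytes(long value) {
    byte[] b = new byte[8];
    for (int i = 7; i >= 0; i--) {
      b[i] = (byte) value;
      value >>>= 8;
    }
    return new BytesRef(b);
  }

  /** Decodes the iterator's current binary value back into a long. */
  static long getValue(BinaryDocValues bdv) throws IOException {
    BytesRef b = bdv.binaryValue();
    long value = 0;
    for (int i = 0; i < 8; i++) {
      value = (value << 8) | (b.bytes[b.offset + i] & 0xFF);
    }
    return value;
  }
}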