Use of org.apache.druid.segment.writeout.SegmentWriteOutMedium in project druid by druid-io.
The class V3CompressedVSizeColumnarMultiIntsSerializerTest, method generateV2SerializedSizeAndData.
private void generateV2SerializedSizeAndData(long numRows, int maxValue, int maxValuesPerRow, int offsetChunkFactor, int valueChunkFactor) throws Exception
{
  File tmpDirectory = FileUtils.createTempDir(
      StringUtils.format("CompressedVSizeIndexedV3WriterTest_%d_%d", offsetChunkFactor, offsetChunkFactor)
  );
  FileSmoosher smoosher = new FileSmoosher(tmpDirectory);

  // TEST_COLUMN_NAME, byteOrder, compressionStrategy, rand, generateRow() and the JUnit
  // temporaryFolder rule are fields/helpers of the test class that are not shown here.
  try (SegmentWriteOutMedium segmentWriteOutMedium =
           TmpFileSegmentWriteOutMediumFactory.instance().makeSegmentWriteOutMedium(temporaryFolder.newFolder())) {
    CompressedColumnarIntsSerializer offsetWriter = new CompressedColumnarIntsSerializer(
        TEST_COLUMN_NAME, segmentWriteOutMedium, offsetChunkFactor, byteOrder, compressionStrategy,
        GenericIndexedWriter.ofCompressedByteBuffers(segmentWriteOutMedium, "offset", compressionStrategy, Long.BYTES * 250000)
    );
    GenericIndexedWriter genericIndexed =
        GenericIndexedWriter.ofCompressedByteBuffers(segmentWriteOutMedium, "value", compressionStrategy, Long.BYTES * 250000);
    CompressedVSizeColumnarIntsSerializer valueWriter = new CompressedVSizeColumnarIntsSerializer(
        TEST_COLUMN_NAME, segmentWriteOutMedium, maxValue, valueChunkFactor, byteOrder, compressionStrategy, genericIndexed
    );
    V3CompressedVSizeColumnarMultiIntsSerializer writer =
        new V3CompressedVSizeColumnarMultiIntsSerializer(TEST_COLUMN_NAME, offsetWriter, valueWriter);
    writer.open();

    for (long l = 0L; l < numRows; l++) {
      writer.addValues(new ArrayBasedIndexedInts(generateRow(rand, maxValue, maxValuesPerRow)));
    }

    final SmooshedWriter channel = smoosher.addWithSmooshedWriter("test", writer.getSerializedSize());
    writer.writeTo(channel, smoosher);
    channel.close();
    smoosher.close();

    SmooshedFileMapper mapper = Smoosh.map(tmpDirectory);
    V3CompressedVSizeColumnarMultiIntsSupplier supplierFromByteBuffer =
        V3CompressedVSizeColumnarMultiIntsSupplier.fromByteBuffer(mapper.mapFile("test"), byteOrder, mapper);
    ColumnarMultiInts columnarMultiInts = supplierFromByteBuffer.get();
    Assert.assertEquals(columnarMultiInts.size(), numRows);

    // The verifier replays the row generator with the same seed, so the values read back from the
    // mapped column can be compared row by row against freshly generated expectations.
    Random verifier = new Random(0);
    for (int i = 0; i < numRows; ++i) {
      IndexedInts subVals = columnarMultiInts.get(i);
      int[] expected = generateRow(verifier, maxValue, maxValuesPerRow);
      Assert.assertEquals(subVals.size(), expected.length);
      for (int j = 0, size = subVals.size(); j < size; ++j) {
        Assert.assertEquals(subVals.get(j), expected[j]);
      }
    }
    CloseableUtils.closeAll(columnarMultiInts, mapper);
  }
}
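All of the serializers above stage their intermediate bytes through the SegmentWriteOutMedium rather than in plain heap buffers. The following is a minimal sketch of that lifecycle, not taken from the Druid test suite: it assumes only the interface's makeWriteOutBytes() accessor and the tmp-file factory shown above; the class name, directory names, and the final FileChannel copy are illustrative.

import java.io.File;
import java.nio.channels.FileChannel;
import java.nio.file.StandardOpenOption;
import org.apache.druid.java.util.common.FileUtils;
import org.apache.druid.segment.writeout.SegmentWriteOutMedium;
import org.apache.druid.segment.writeout.TmpFileSegmentWriteOutMediumFactory;
import org.apache.druid.segment.writeout.WriteOutBytes;

public class WriteOutMediumLifecycleSketch
{
  public static void main(String[] args) throws Exception
  {
    File tmpDir = FileUtils.createTempDir("write-out-medium-sketch");
    try (SegmentWriteOutMedium medium =
             TmpFileSegmentWriteOutMediumFactory.instance().makeSegmentWriteOutMedium(tmpDir)) {
      // Scratch buffer backed by a temp file; an off-heap medium would buffer in direct memory instead.
      WriteOutBytes scratch = medium.makeWriteOutBytes();
      scratch.write(new byte[]{1, 2, 3});

      // Copy the staged bytes into their final location, e.g. a smoosh file or a plain channel.
      try (FileChannel out = FileChannel.open(
          new File(tmpDir, "column.bin").toPath(),
          StandardOpenOption.CREATE,
          StandardOpenOption.WRITE)) {
        scratch.writeTo(out);
      }
    }  // closing the medium releases every WriteOutBytes it handed out
  }
}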
Use of org.apache.druid.segment.writeout.SegmentWriteOutMedium in project druid by druid-io.
The class LargeColumnSupportedComplexColumnSerializerTest, method testSanity.
@Test
public void testSanity() throws IOException
{
  // fn (a Guava HashFunction) and the JUnit temporaryFolder rule are fields of the test class.
  HyperUniquesSerdeForTest serde = new HyperUniquesSerdeForTest(Hashing.murmur3_128());
  int[] cases = {1000, 5000, 10000, 20000};
  int[] columnSizes = {Integer.MAX_VALUE, Integer.MAX_VALUE / 2, Integer.MAX_VALUE / 4, 5000 * Long.BYTES, 2500 * Long.BYTES};

  for (int columnSize : columnSizes) {
    for (int aCase : cases) {
      File tmpFile = temporaryFolder.newFolder();
      HyperLogLogCollector baseCollector = HyperLogLogCollector.makeLatestCollector();

      try (SegmentWriteOutMedium segmentWriteOutMedium = new OffHeapMemorySegmentWriteOutMedium();
           FileSmoosher v9Smoosher = new FileSmoosher(tmpFile)) {
        LargeColumnSupportedComplexColumnSerializer serializer =
            LargeColumnSupportedComplexColumnSerializer.createWithColumnSize(
                segmentWriteOutMedium,
                "test",
                serde.getObjectStrategy(),
                columnSize
            );
        serializer.open();

        for (int i = 0; i < aCase; i++) {
          HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();
          byte[] hashBytes = fn.hashLong(i).asBytes();
          collector.add(hashBytes);
          baseCollector.fold(collector);
          serializer.serialize(new ObjectColumnSelector()
          {
            @Nullable
            @Override
            public Object getObject()
            {
              return collector;
            }

            @Override
            public Class classOfObject()
            {
              return HyperLogLogCollector.class;
            }

            @Override
            public void inspectRuntimeShape(RuntimeShapeInspector inspector)
            {
              // doesn't matter in tests
            }
          });
        }

        try (final SmooshedWriter channel = v9Smoosher.addWithSmooshedWriter("test", serializer.getSerializedSize())) {
          serializer.writeTo(channel, v9Smoosher);
        }
      }

      // Read the column back through the smooshed file mapper and verify the merged cardinality.
      SmooshedFileMapper mapper = Smoosh.map(tmpFile);
      final ColumnBuilder builder = new ColumnBuilder()
          .setType(ValueType.COMPLEX)
          .setHasMultipleValues(false)
          .setFileMapper(mapper);
      serde.deserializeColumn(mapper.mapFile("test"), builder, null);
      ColumnHolder columnHolder = builder.build();
      ComplexColumn complexColumn = (ComplexColumn) columnHolder.getColumn();

      HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();
      for (int i = 0; i < aCase; i++) {
        collector.fold((HyperLogLogCollector) complexColumn.getRowValue(i));
      }
      Assert.assertEquals(baseCollector.estimateCardinality(), collector.estimateCardinality(), 0.0);
    }
  }
}
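This test backs its serializer with an OffHeapMemorySegmentWriteOutMedium, while the other snippets on this page use the tmp-file factory. Both implementations satisfy the same interface and can be swapped behind any serializer; the sketch below only illustrates their construction, assuming a scratch directory from the test's temporaryFolder rule.

File scratchDir = temporaryFolder.newFolder();

// Buffers intermediate column bytes in direct (off-heap) memory.
try (SegmentWriteOutMedium offHeapMedium = new OffHeapMemorySegmentWriteOutMedium()) {
  // ... hand offHeapMedium to a serializer ...
}

// Buffers intermediate column bytes in temporary files under scratchDir.
try (SegmentWriteOutMedium tmpFileMedium =
         TmpFileSegmentWriteOutMediumFactory.instance().makeSegmentWriteOutMedium(scratchDir)) {
  // ... hand tmpFileMedium to a serializer ...
}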
Use of org.apache.druid.segment.writeout.SegmentWriteOutMedium in project druid by druid-io.
The class CompressedColumnarIntsSerializerTest, method testTooManyValues.
// this test takes ~30 minutes to run
@Ignore
@Test
public void testTooManyValues() throws IOException
{
  expectedException.expect(ColumnCapacityExceededException.class);
  expectedException.expectMessage(ColumnCapacityExceededException.formatMessage("test"));
  try (SegmentWriteOutMedium segmentWriteOutMedium =
           TmpFileSegmentWriteOutMediumFactory.instance().makeSegmentWriteOutMedium(temporaryFolder.newFolder())) {
    CompressedColumnarIntsSerializer serializer = new CompressedColumnarIntsSerializer(
        "test", segmentWriteOutMedium, "test", CompressedColumnarIntsSupplier.MAX_INTS_IN_BUFFER, byteOrder, compressionStrategy
    );
    serializer.open();

    final long numRows = Integer.MAX_VALUE + 100L;
    for (long i = 0L; i < numRows; i++) {
      serializer.addValue(ThreadLocalRandom.current().nextInt(0, Integer.MAX_VALUE));
    }
  }
}
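The same capacity check can be expressed without the ExpectedException rule. A sketch using JUnit's Assert.assertThrows (available from JUnit 4.13 onward; the byteOrder, compressionStrategy, and temporaryFolder fields are assumed from the test class, and the message check uses contains() because expectMessage performs a substring match):

final ColumnCapacityExceededException e = Assert.assertThrows(
    ColumnCapacityExceededException.class,
    () -> {
      try (SegmentWriteOutMedium medium =
               TmpFileSegmentWriteOutMediumFactory.instance().makeSegmentWriteOutMedium(temporaryFolder.newFolder())) {
        CompressedColumnarIntsSerializer serializer = new CompressedColumnarIntsSerializer(
            "test", medium, "test", CompressedColumnarIntsSupplier.MAX_INTS_IN_BUFFER, byteOrder, compressionStrategy
        );
        serializer.open();
        // Writing past Integer.MAX_VALUE rows should raise the capacity exception long before the loop ends.
        for (long i = 0L; i < Integer.MAX_VALUE + 100L; i++) {
          serializer.addValue(ThreadLocalRandom.current().nextInt(0, Integer.MAX_VALUE));
        }
      }
    }
);
Assert.assertTrue(e.getMessage().contains(ColumnCapacityExceededException.formatMessage("test")));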
Use of org.apache.druid.segment.writeout.SegmentWriteOutMedium in project druid by druid-io.
The class CompressedDoublesSerdeTest, method testTooManyValues.
// this test takes ~45 minutes to run
@Ignore
@Test
public void testTooManyValues() throws IOException
{
  expectedException.expect(ColumnCapacityExceededException.class);
  expectedException.expectMessage(ColumnCapacityExceededException.formatMessage("test"));
  try (SegmentWriteOutMedium segmentWriteOutMedium =
           TmpFileSegmentWriteOutMediumFactory.instance().makeSegmentWriteOutMedium(temporaryFolder.newFolder())) {
    ColumnarDoublesSerializer serializer =
        CompressionFactory.getDoubleSerializer("test", segmentWriteOutMedium, "test", order, compressionStrategy);
    serializer.open();

    final long numRows = Integer.MAX_VALUE + 100L;
    for (long i = 0L; i < numRows; i++) {
      serializer.add(ThreadLocalRandom.current().nextDouble());
    }
  }
}
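Outside of this overflow case, the double serializer follows the same open / add / writeTo sequence as the int serializers above. A small happy-path sketch, reusing only calls that appear elsewhere on this page and assuming the test's order, compressionStrategy, and temporaryFolder fields; the "doubles" column and file names are placeholders:

File mediumDir = temporaryFolder.newFolder();
File smooshDir = temporaryFolder.newFolder();
try (SegmentWriteOutMedium medium =
         TmpFileSegmentWriteOutMediumFactory.instance().makeSegmentWriteOutMedium(mediumDir);
     FileSmoosher smoosher = new FileSmoosher(smooshDir)) {
  ColumnarDoublesSerializer serializer =
      CompressionFactory.getDoubleSerializer("doubles", medium, "doubles", order, compressionStrategy);
  serializer.open();
  for (double d : new double[]{0.5, 1.25, 2.0}) {
    serializer.add(d);
  }
  // Reserve space in the smoosh file and flush the serialized column into it.
  try (SmooshedWriter channel = smoosher.addWithSmooshedWriter("doubles", serializer.getSerializedSize())) {
    serializer.writeTo(channel, smoosher);
  }
}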
Use of org.apache.druid.segment.writeout.SegmentWriteOutMedium in project druid by druid-io.
The class IndexMergerV9, method makeIndexFiles.
private File makeIndexFiles(
    final List<IndexableAdapter> adapters,
    @Nullable final AggregatorFactory[] metricAggs,
    final File outDir,
    final ProgressIndicator progress,
    final List<String> mergedDimensions,
    final List<String> mergedMetrics,
    final Function<List<TransformableRowIterator>, TimeAndDimsIterator> rowMergerFn,
    final boolean fillRowNumConversions,
    final IndexSpec indexSpec,
    @Nullable final SegmentWriteOutMediumFactory segmentWriteOutMediumFactory
) throws IOException
{
  progress.start();
  progress.progress();

  List<Metadata> metadataList = Lists.transform(adapters, IndexableAdapter::getMetadata);
  final Metadata segmentMetadata;
  if (metricAggs != null) {
    AggregatorFactory[] combiningMetricAggs = new AggregatorFactory[metricAggs.length];
    for (int i = 0; i < metricAggs.length; i++) {
      combiningMetricAggs[i] = metricAggs[i].getCombiningFactory();
    }
    segmentMetadata = Metadata.merge(metadataList, combiningMetricAggs);
  } else {
    segmentMetadata = Metadata.merge(metadataList, null);
  }

  Closer closer = Closer.create();
  try {
    final FileSmoosher v9Smoosher = new FileSmoosher(outDir);
    FileUtils.mkdirp(outDir);

    SegmentWriteOutMediumFactory omf =
        segmentWriteOutMediumFactory != null ? segmentWriteOutMediumFactory : defaultSegmentWriteOutMediumFactory;
    log.debug("Using SegmentWriteOutMediumFactory[%s]", omf.getClass().getSimpleName());
    SegmentWriteOutMedium segmentWriteOutMedium = omf.makeSegmentWriteOutMedium(outDir);
    closer.register(segmentWriteOutMedium);

    long startTime = System.currentTimeMillis();
    Files.asByteSink(new File(outDir, "version.bin")).write(Ints.toByteArray(IndexIO.V9_VERSION));
    log.debug("Completed version.bin in %,d millis.", System.currentTimeMillis() - startTime);

    progress.progress();
    startTime = System.currentTimeMillis();
    try (FileOutputStream fos = new FileOutputStream(new File(outDir, "factory.json"))) {
      SegmentizerFactory customSegmentLoader = indexSpec.getSegmentLoader();
      if (customSegmentLoader != null) {
        mapper.writeValue(fos, customSegmentLoader);
      } else {
        mapper.writeValue(fos, new MMappedQueryableSegmentizerFactory(indexIO));
      }
    }
    log.debug("Completed factory.json in %,d millis", System.currentTimeMillis() - startTime);

    progress.progress();
    final Map<String, ValueType> metricsValueTypes = new TreeMap<>(Comparators.naturalNullsFirst());
    final Map<String, String> metricTypeNames = new TreeMap<>(Comparators.naturalNullsFirst());
    final List<ColumnCapabilities> dimCapabilities = Lists.newArrayListWithCapacity(mergedDimensions.size());
    mergeCapabilities(adapters, mergedDimensions, metricsValueTypes, metricTypeNames, dimCapabilities);

    final Map<String, DimensionHandler> handlers = makeDimensionHandlers(mergedDimensions, dimCapabilities);
    final List<DimensionMergerV9> mergers = new ArrayList<>();
    for (int i = 0; i < mergedDimensions.size(); i++) {
      DimensionHandler handler = handlers.get(mergedDimensions.get(i));
      mergers.add(handler.makeMerger(indexSpec, segmentWriteOutMedium, dimCapabilities.get(i), progress, closer));
    }

    /**
     *********** Setup Dim Conversions *************
     */
    progress.progress();
    startTime = System.currentTimeMillis();
    writeDimValuesAndSetupDimConversion(adapters, progress, mergedDimensions, mergers);
    log.debug("Completed dim conversions in %,d millis.", System.currentTimeMillis() - startTime);

    /**
     *********** Walk through data sets, merge them, and write merged columns ************
     */
    progress.progress();
    final TimeAndDimsIterator timeAndDimsIterator =
        makeMergedTimeAndDimsIterator(adapters, mergedDimensions, mergedMetrics, rowMergerFn, handlers, mergers);
    closer.register(timeAndDimsIterator);
    final GenericColumnSerializer timeWriter = setupTimeWriter(segmentWriteOutMedium, indexSpec);
    final ArrayList<GenericColumnSerializer> metricWriters =
        setupMetricsWriters(segmentWriteOutMedium, mergedMetrics, metricsValueTypes, metricTypeNames, indexSpec);
    List<IntBuffer> rowNumConversions =
        mergeIndexesAndWriteColumns(adapters, progress, timeAndDimsIterator, timeWriter, metricWriters, mergers, fillRowNumConversions);

    /**
     ********** Create Inverted Indexes and Finalize Build Columns ************
     */
    final String section = "build inverted index and columns";
    progress.startSection(section);
    makeTimeColumn(v9Smoosher, progress, timeWriter, indexSpec);
    makeMetricsColumns(v9Smoosher, progress, mergedMetrics, metricsValueTypes, metricTypeNames, metricWriters, indexSpec);
    for (int i = 0; i < mergedDimensions.size(); i++) {
      DimensionMergerV9 merger = mergers.get(i);
      merger.writeIndexes(rowNumConversions);
      if (merger.canSkip()) {
        continue;
      }
      ColumnDescriptor columnDesc = merger.makeColumnDescriptor();
      makeColumn(v9Smoosher, mergedDimensions.get(i), columnDesc);
    }
    progress.stopSection(section);

    /**
     *********** Make index.drd & metadata.drd files *************
     */
    progress.progress();
    makeIndexBinary(v9Smoosher, adapters, outDir, mergedDimensions, mergedMetrics, progress, indexSpec, mergers);
    makeMetadataBinary(v9Smoosher, progress, segmentMetadata);

    v9Smoosher.close();
    progress.stop();
    return outDir;
  } catch (Throwable t) {
    throw closer.rethrow(t);
  } finally {
    closer.close();
  }
}
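The only place makeIndexFiles touches the write-out machinery directly is the factory selection near the top of the try block: an explicit SegmentWriteOutMediumFactory argument wins, otherwise defaultSegmentWriteOutMediumFactory is used, and the resulting medium is registered with the Closer so its scratch space is released even if the merge fails. A hedged sketch of two concrete factories a caller could pass; the off-heap factory is assumed to exist as the counterpart of the OffHeapMemorySegmentWriteOutMedium used in the complex-column test above:

// Stage merge intermediates in temporary files under the segment's output directory (bounded by disk).
SegmentWriteOutMediumFactory tmpFileFactory = TmpFileSegmentWriteOutMediumFactory.instance();

// Stage them in direct (off-heap) memory instead (bounded by available direct memory).
SegmentWriteOutMediumFactory offHeapFactory = OffHeapMemorySegmentWriteOutMediumFactory.instance();

// Passing null as the last argument of makeIndexFiles falls back to defaultSegmentWriteOutMediumFactory.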