Example 1 with TmpFileIOPeon

Use of io.druid.segment.data.TmpFileIOPeon in project druid by druid-io.

From the class LargeColumnSupportedComplexColumnSerializerTest, method testSanity:

@Test
public void testSanity() throws IOException {
    HyperUniquesSerdeForTest serde = new HyperUniquesSerdeForTest(Hashing.murmur3_128());
    int[] cases = { 1000, 5000, 10000, 20000 };
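    // smaller column sizes force the serializer to split the column into multiple smooshed
    // parts (the "large column" path), while Integer.MAX_VALUE keeps it in a single part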
    int[] columnSizes = { Integer.MAX_VALUE, Integer.MAX_VALUE / 2, Integer.MAX_VALUE / 4, 5000 * Longs.BYTES, 2500 * Longs.BYTES };
    for (int columnSize : columnSizes) {
        for (int aCase : cases) {
            File tmpFile = FileUtils.getTempDirectory();
            HyperLogLogCollector baseCollector = HyperLogLogCollector.makeLatestCollector();
            try (IOPeon peon = new TmpFileIOPeon();
                FileSmoosher v9Smoosher = new FileSmoosher(tmpFile)) {
                LargeColumnSupportedComplexColumnSerializer serializer = LargeColumnSupportedComplexColumnSerializer.createWithColumnSize(peon, "test", serde.getObjectStrategy(), columnSize);
                serializer.open();
                for (int i = 0; i < aCase; i++) {
                    HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();
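                    // fn is a HashFunction field of the test class (e.g. Hashing.murmur3_128()), not shown in this excerpt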
                    byte[] hashBytes = fn.hashLong(i).asBytes();
                    collector.add(hashBytes);
                    baseCollector.fold(collector);
                    serializer.serialize(collector);
                }
                serializer.close();
                try (final SmooshedWriter channel = v9Smoosher.addWithSmooshedWriter("test", serializer.getSerializedSize())) {
                    serializer.writeToChannel(channel, v9Smoosher);
                }
            }
            SmooshedFileMapper mapper = Smoosh.map(tmpFile);
            final ColumnBuilder builder = new ColumnBuilder().setType(ValueType.COMPLEX).setHasMultipleValues(false).setFileMapper(mapper);
            serde.deserializeColumn(mapper.mapFile("test"), builder);
            Column column = builder.build();
            ComplexColumn complexColumn = column.getComplexColumn();
            HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();
            for (int i = 0; i < aCase; i++) {
                collector.fold((HyperLogLogCollector) complexColumn.getRowValue(i));
            }
            Assert.assertEquals(baseCollector.estimateCardinality(), collector.estimateCardinality(), 0.0);
        }
    }
}
Also used: SmooshedWriter(io.druid.java.util.common.io.smoosh.SmooshedWriter), HyperLogLogCollector(io.druid.hll.HyperLogLogCollector), IOPeon(io.druid.segment.data.IOPeon), TmpFileIOPeon(io.druid.segment.data.TmpFileIOPeon), Column(io.druid.segment.column.Column), ComplexColumn(io.druid.segment.column.ComplexColumn), FileSmoosher(io.druid.java.util.common.io.smoosh.FileSmoosher), ColumnBuilder(io.druid.segment.column.ColumnBuilder), File(java.io.File), SmooshedFileMapper(io.druid.java.util.common.io.smoosh.SmooshedFileMapper), Test(org.junit.Test)
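
All five examples follow the same lifecycle: a TmpFileIOPeon hands out named scratch streams that back a serializer, and closing the peon discards the temporary files it created. The following is a minimal, self-contained sketch of that lifecycle; it assumes the makeOutputStream and makeInputStream methods of the IOPeon interface, which is how the serializers in these excerpts use the peon.

import io.druid.segment.data.IOPeon;
import io.druid.segment.data.TmpFileIOPeon;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;

public class TmpFileIOPeonLifecycleSketch {

    public static void main(String[] args) throws IOException {
        // Closing the peon is expected to delete the temporary files it created, which is
        // why the examples always wrap it in try-with-resources or close it in a finally block.
        try (IOPeon peon = new TmpFileIOPeon()) {
            // "scratch" is an illustrative name only; the peon maps names to temp files.
            try (OutputStream out = peon.makeOutputStream("scratch")) {
                out.write("hello".getBytes(StandardCharsets.UTF_8));
            }
            try (InputStream in = peon.makeInputStream("scratch")) {
                byte[] buf = new byte[16];
                int n = in.read(buf);
                System.out.println(new String(buf, 0, n, StandardCharsets.UTF_8));
            }
        }
    }
}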

Example 2 with TmpFileIOPeon

Use of io.druid.segment.data.TmpFileIOPeon in project druid by druid-io.

From the class LongCompressionBenchmarkFileGenerator, method main:

public static void main(String[] args) throws IOException, URISyntaxException {
    if (args.length >= 1) {
        dirPath = args[0];
    }
    BenchmarkColumnSchema enumeratedSchema = BenchmarkColumnSchema.makeEnumerated("", ValueType.LONG, true, 1, 0d, ImmutableList.<Object>of(0, 1, 2, 3, 4), ImmutableList.of(0.95, 0.001, 0.0189, 0.03, 0.0001));
    BenchmarkColumnSchema zipfLowSchema = BenchmarkColumnSchema.makeZipf("", ValueType.LONG, true, 1, 0d, -1, 1000, 1d);
    BenchmarkColumnSchema zipfHighSchema = BenchmarkColumnSchema.makeZipf("", ValueType.LONG, true, 1, 0d, -1, 1000, 3d);
    BenchmarkColumnSchema sequentialSchema = BenchmarkColumnSchema.makeSequential("", ValueType.LONG, true, 1, 0d, 1470187671, 2000000000);
    BenchmarkColumnSchema uniformSchema = BenchmarkColumnSchema.makeDiscreteUniform("", ValueType.LONG, true, 1, 0d, 0, 1000);
    Map<String, BenchmarkColumnValueGenerator> generators = new HashMap<>();
    generators.put("enumerate", new BenchmarkColumnValueGenerator(enumeratedSchema, 1));
    generators.put("zipfLow", new BenchmarkColumnValueGenerator(zipfLowSchema, 1));
    generators.put("zipfHigh", new BenchmarkColumnValueGenerator(zipfHighSchema, 1));
    generators.put("sequential", new BenchmarkColumnValueGenerator(sequentialSchema, 1));
    generators.put("uniform", new BenchmarkColumnValueGenerator(uniformSchema, 1));
    File dir = new File(dirPath);
    dir.mkdir();
    // create data files using BenchmarkColumnValueGenerator
    for (Map.Entry<String, BenchmarkColumnValueGenerator> entry : generators.entrySet()) {
        final File dataFile = new File(dir, entry.getKey());
        dataFile.delete();
        try (Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(dataFile)))) {
            for (int i = 0; i < ROW_NUM; i++) {
                writer.write((long) entry.getValue().generateRowValue() + "\n");
            }
        }
    }
    // create compressed files using all combinations of CompressionStrategy and LongEncoding provided
    for (Map.Entry<String, BenchmarkColumnValueGenerator> entry : generators.entrySet()) {
        for (CompressedObjectStrategy.CompressionStrategy compression : compressions) {
            for (CompressionFactory.LongEncodingStrategy encoding : encodings) {
                String name = entry.getKey() + "-" + compression.toString() + "-" + encoding.toString();
                System.out.print(name + ": ");
                File compFile = new File(dir, name);
                compFile.delete();
                File dataFile = new File(dir, entry.getKey());
                TmpFileIOPeon iopeon = new TmpFileIOPeon(true);
                LongSupplierSerializer writer = CompressionFactory.getLongSerializer(iopeon, "long", ByteOrder.nativeOrder(), encoding, compression);
                BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(dataFile)));
                try (FileChannel output = FileChannel.open(compFile.toPath(), StandardOpenOption.CREATE_NEW, StandardOpenOption.WRITE)) {
                    writer.open();
                    String line;
                    while ((line = br.readLine()) != null) {
                        writer.add(Long.parseLong(line));
                    }
                    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
                    writer.closeAndConsolidate(new ByteSink() {

                        @Override
                        public OutputStream openStream() throws IOException {
                            return baos;
                        }
                    });
                    output.write(ByteBuffer.wrap(baos.toByteArray()));
                } finally {
                    iopeon.close();
                    br.close();
                }
                System.out.print(compFile.length() / 1024 + "\n");
            }
        }
    }
}
Also used: HashMap(java.util.HashMap), ByteArrayOutputStream(java.io.ByteArrayOutputStream), OutputStream(java.io.OutputStream), FileOutputStream(java.io.FileOutputStream), CompressedObjectStrategy(io.druid.segment.data.CompressedObjectStrategy), BufferedWriter(java.io.BufferedWriter), ByteSink(com.google.common.io.ByteSink), TmpFileIOPeon(io.druid.segment.data.TmpFileIOPeon), InputStreamReader(java.io.InputStreamReader), FileChannel(java.nio.channels.FileChannel), BenchmarkColumnValueGenerator(io.druid.benchmark.datagen.BenchmarkColumnValueGenerator), IOException(java.io.IOException), FileInputStream(java.io.FileInputStream), BufferedReader(java.io.BufferedReader), OutputStreamWriter(java.io.OutputStreamWriter), CompressionFactory(io.druid.segment.data.CompressionFactory), BenchmarkColumnSchema(io.druid.benchmark.datagen.BenchmarkColumnSchema), File(java.io.File), Map(java.util.Map), Writer(java.io.Writer), LongSupplierSerializer(io.druid.segment.data.LongSupplierSerializer)
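
The anonymous ByteSink above buffers the entire compressed column in a ByteArrayOutputStream before copying it to the FileChannel. As a sketch only, reusing the LongSupplierSerializer calls from the example and Guava's Files.asByteSink (which Example 3 below also uses), the consolidation step could instead stream straight into the target file:

import com.google.common.io.Files;
import io.druid.segment.data.LongSupplierSerializer;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

public class LongColumnConsolidationSketch {

    // Hypothetical helper: reads one long per line from dataFile and lets the serializer
    // consolidate directly into compFile, skipping the in-memory ByteArrayOutputStream.
    static void writeColumn(File dataFile, File compFile, LongSupplierSerializer writer) throws IOException {
        writer.open();
        try (BufferedReader br = new BufferedReader(
            new InputStreamReader(new FileInputStream(dataFile), StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                writer.add(Long.parseLong(line));
            }
        }
        // Files.asByteSink(compFile) truncates compFile and writes the consolidated column to it.
        writer.closeAndConsolidate(Files.asByteSink(compFile));
    }
}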

Example 3 with TmpFileIOPeon

Use of io.druid.segment.data.TmpFileIOPeon in project druid by druid-io.

From the class IndexMerger, method makeIndexFiles:

protected File makeIndexFiles(final List<IndexableAdapter> indexes, final AggregatorFactory[] metricAggs, final File outDir, final ProgressIndicator progress, final List<String> mergedDimensions, final List<String> mergedMetrics, final Function<ArrayList<Iterable<Rowboat>>, Iterable<Rowboat>> rowMergerFn, final IndexSpec indexSpec) throws IOException {
    List<Metadata> metadataList = Lists.transform(indexes, new Function<IndexableAdapter, Metadata>() {

        @Nullable
        @Override
        public Metadata apply(IndexableAdapter input) {
            return input.getMetadata();
        }
    });
    Metadata segmentMetadata = null;
    if (metricAggs != null) {
        AggregatorFactory[] combiningMetricAggs = new AggregatorFactory[metricAggs.length];
        for (int i = 0; i < metricAggs.length; i++) {
            combiningMetricAggs[i] = metricAggs[i].getCombiningFactory();
        }
        segmentMetadata = Metadata.merge(metadataList, combiningMetricAggs);
    } else {
        segmentMetadata = Metadata.merge(metadataList, null);
    }
    final Map<String, ValueType> valueTypes = Maps.newTreeMap(Ordering.<String>natural().nullsFirst());
    final Map<String, String> metricTypeNames = Maps.newTreeMap(Ordering.<String>natural().nullsFirst());
    final Map<String, ColumnCapabilitiesImpl> columnCapabilities = Maps.newHashMap();
    final List<ColumnCapabilitiesImpl> dimCapabilities = new ArrayList<>();
    for (IndexableAdapter adapter : indexes) {
        for (String dimension : adapter.getDimensionNames()) {
            ColumnCapabilitiesImpl mergedCapabilities = columnCapabilities.get(dimension);
            ColumnCapabilities capabilities = adapter.getCapabilities(dimension);
            if (mergedCapabilities == null) {
                mergedCapabilities = new ColumnCapabilitiesImpl();
            }
            columnCapabilities.put(dimension, mergedCapabilities.merge(capabilities));
        }
        for (String metric : adapter.getMetricNames()) {
            ColumnCapabilitiesImpl mergedCapabilities = columnCapabilities.get(metric);
            ColumnCapabilities capabilities = adapter.getCapabilities(metric);
            if (mergedCapabilities == null) {
                mergedCapabilities = new ColumnCapabilitiesImpl();
            }
            columnCapabilities.put(metric, mergedCapabilities.merge(capabilities));
            valueTypes.put(metric, capabilities.getType());
            metricTypeNames.put(metric, adapter.getMetricType(metric));
        }
    }
    for (String dimension : mergedDimensions) {
        dimCapabilities.add(columnCapabilities.get(dimension));
    }
    Closer closer = Closer.create();
    try {
        final Interval dataInterval;
        final File v8OutDir = new File(outDir, "v8-tmp");
        FileUtils.forceMkdir(v8OutDir);
        registerDeleteDirectory(closer, v8OutDir);
        File tmpPeonFilesDir = new File(v8OutDir, "tmpPeonFiles");
        FileUtils.forceMkdir(tmpPeonFilesDir);
        registerDeleteDirectory(closer, tmpPeonFilesDir);
        final IOPeon ioPeon = new TmpFileIOPeon(tmpPeonFilesDir, true);
        closer.register(ioPeon);
        /*************  Main index.drd file **************/
        progress.progress();
        long startTime = System.currentTimeMillis();
        File indexFile = new File(v8OutDir, "index.drd");
        try (FileOutputStream fileOutputStream = new FileOutputStream(indexFile);
            FileChannel channel = fileOutputStream.getChannel()) {
            channel.write(ByteBuffer.wrap(new byte[] { IndexIO.V8_VERSION }));
            GenericIndexed.fromIterable(mergedDimensions, GenericIndexed.STRING_STRATEGY).writeToChannel(channel);
            GenericIndexed.fromIterable(mergedMetrics, GenericIndexed.STRING_STRATEGY).writeToChannel(channel);
            DateTime minTime = new DateTime(JodaUtils.MAX_INSTANT);
            DateTime maxTime = new DateTime(JodaUtils.MIN_INSTANT);
            for (IndexableAdapter index : indexes) {
                minTime = JodaUtils.minDateTime(minTime, index.getDataInterval().getStart());
                maxTime = JodaUtils.maxDateTime(maxTime, index.getDataInterval().getEnd());
            }
            dataInterval = new Interval(minTime, maxTime);
            serializerUtils.writeString(channel, String.format("%s/%s", minTime, maxTime));
            serializerUtils.writeString(channel, mapper.writeValueAsString(indexSpec.getBitmapSerdeFactory()));
        }
        IndexIO.checkFileSize(indexFile);
        log.info("outDir[%s] completed index.drd in %,d millis.", v8OutDir, System.currentTimeMillis() - startTime);
        /************* Setup Dim Conversions **************/
        progress.progress();
        startTime = System.currentTimeMillis();
        final ArrayList<FileOutputSupplier> dimOuts = Lists.newArrayListWithCapacity(mergedDimensions.size());
        final DimensionHandler[] handlers = makeDimensionHandlers(mergedDimensions, dimCapabilities);
        final List<DimensionMerger> mergers = new ArrayList<>();
        for (int i = 0; i < mergedDimensions.size(); i++) {
            DimensionMergerLegacy merger = handlers[i].makeLegacyMerger(indexSpec, v8OutDir, ioPeon, dimCapabilities.get(i), progress);
            mergers.add(merger);
            merger.writeMergedValueMetadata(indexes);
            FileOutputSupplier dimOut = new FileOutputSupplier(merger.makeDimFile(), true);
            merger.writeValueMetadataToFile(dimOut);
            dimOuts.add(dimOut);
        }
        log.info("outDir[%s] completed dim conversions in %,d millis.", v8OutDir, System.currentTimeMillis() - startTime);
        /************* Walk through data sets and merge them *************/
        progress.progress();
        startTime = System.currentTimeMillis();
        Iterable<Rowboat> theRows = makeRowIterable(indexes, mergedDimensions, mergedMetrics, rowMergerFn, dimCapabilities, handlers, mergers);
        LongSupplierSerializer timeWriter = CompressionFactory.getLongSerializer(ioPeon, "little_end_time", IndexIO.BYTE_ORDER, indexSpec.getLongEncoding(), CompressedObjectStrategy.DEFAULT_COMPRESSION_STRATEGY);
        timeWriter.open();
        ArrayList<MetricColumnSerializer> metWriters = Lists.newArrayListWithCapacity(mergedMetrics.size());
        final CompressedObjectStrategy.CompressionStrategy metCompression = indexSpec.getMetricCompression();
        final CompressionFactory.LongEncodingStrategy longEncoding = indexSpec.getLongEncoding();
        for (String metric : mergedMetrics) {
            ValueType type = valueTypes.get(metric);
            switch(type) {
                case LONG:
                    metWriters.add(new LongMetricColumnSerializer(metric, v8OutDir, ioPeon, metCompression, longEncoding));
                    break;
                case FLOAT:
                    metWriters.add(new FloatMetricColumnSerializer(metric, v8OutDir, ioPeon, metCompression));
                    break;
                case COMPLEX:
                    final String typeName = metricTypeNames.get(metric);
                    ComplexMetricSerde serde = ComplexMetrics.getSerdeForType(typeName);
                    if (serde == null) {
                        throw new ISE("Unknown type[%s]", typeName);
                    }
                    metWriters.add(new ComplexMetricColumnSerializer(metric, v8OutDir, ioPeon, serde));
                    break;
                default:
                    throw new ISE("Unknown type[%s]", type);
            }
        }
        for (MetricColumnSerializer metWriter : metWriters) {
            metWriter.open();
        }
        int rowCount = 0;
        long time = System.currentTimeMillis();
        List<IntBuffer> rowNumConversions = Lists.newArrayListWithCapacity(indexes.size());
        for (IndexableAdapter index : indexes) {
            int[] arr = new int[index.getNumRows()];
            Arrays.fill(arr, INVALID_ROW);
            rowNumConversions.add(IntBuffer.wrap(arr));
        }
        for (Rowboat theRow : theRows) {
            progress.progress();
            timeWriter.add(theRow.getTimestamp());
            final Object[] metrics = theRow.getMetrics();
            for (int i = 0; i < metrics.length; ++i) {
                metWriters.get(i).serialize(metrics[i]);
            }
            Object[] dims = theRow.getDims();
            for (int i = 0; i < dims.length; ++i) {
                mergers.get(i).processMergedRow(dims[i]);
            }
            for (Map.Entry<Integer, TreeSet<Integer>> comprisedRow : theRow.getComprisedRows().entrySet()) {
                final IntBuffer conversionBuffer = rowNumConversions.get(comprisedRow.getKey());
                for (Integer rowNum : comprisedRow.getValue()) {
                    while (conversionBuffer.position() < rowNum) {
                        conversionBuffer.put(INVALID_ROW);
                    }
                    conversionBuffer.put(rowCount);
                }
            }
            if ((++rowCount % 500000) == 0) {
                log.info("outDir[%s] walked 500,000/%,d rows in %,d millis.", v8OutDir, rowCount, System.currentTimeMillis() - time);
                time = System.currentTimeMillis();
            }
        }
        for (IntBuffer rowNumConversion : rowNumConversions) {
            rowNumConversion.rewind();
        }
        final File timeFile = IndexIO.makeTimeFile(v8OutDir, IndexIO.BYTE_ORDER);
        timeFile.delete();
        ByteSink out = Files.asByteSink(timeFile, FileWriteMode.APPEND);
        timeWriter.closeAndConsolidate(out);
        IndexIO.checkFileSize(timeFile);
        for (MetricColumnSerializer metWriter : metWriters) {
            metWriter.close();
        }
        log.info("outDir[%s] completed walk through of %,d rows in %,d millis.", v8OutDir, rowCount, System.currentTimeMillis() - startTime);
        /************ Create Inverted Indexes and Finalize Columns *************/
        startTime = System.currentTimeMillis();
        final File invertedFile = new File(v8OutDir, "inverted.drd");
        Files.touch(invertedFile);
        out = Files.asByteSink(invertedFile, FileWriteMode.APPEND);
        final File geoFile = new File(v8OutDir, "spatial.drd");
        Files.touch(geoFile);
        OutputSupplier<FileOutputStream> spatialOut = Files.newOutputStreamSupplier(geoFile, true);
        for (int i = 0; i < mergedDimensions.size(); i++) {
            DimensionMergerLegacy legacyMerger = (DimensionMergerLegacy) mergers.get(i);
            legacyMerger.writeIndexes(rowNumConversions, closer);
            legacyMerger.writeIndexesToFiles(out, spatialOut);
            legacyMerger.writeRowValuesToFile(dimOuts.get(i));
        }
        log.info("outDir[%s] completed inverted.drd and wrote dimensions in %,d millis.", v8OutDir, System.currentTimeMillis() - startTime);
        final Function<String, String> dimFilenameFunction = new Function<String, String>() {

            @Override
            public String apply(@Nullable String input) {
                String formatString;
                if (columnCapabilities.get(input).isDictionaryEncoded()) {
                    formatString = "dim_%s.drd";
                } else {
                    formatString = String.format("numeric_dim_%%s_%s.drd", IndexIO.BYTE_ORDER);
                }
                return GuavaUtils.formatFunction(formatString).apply(input);
            }
        };
        final ArrayList<String> expectedFiles = Lists.newArrayList(Iterables.concat(Arrays.asList("index.drd", "inverted.drd", "spatial.drd", String.format("time_%s.drd", IndexIO.BYTE_ORDER)), Iterables.transform(mergedDimensions, dimFilenameFunction), Iterables.transform(mergedMetrics, GuavaUtils.formatFunction(String.format("met_%%s_%s.drd", IndexIO.BYTE_ORDER)))));
        if (segmentMetadata != null) {
            writeMetadataToFile(new File(v8OutDir, "metadata.drd"), segmentMetadata);
            log.info("wrote metadata.drd in outDir[%s].", v8OutDir);
            expectedFiles.add("metadata.drd");
        }
        Map<String, File> files = Maps.newLinkedHashMap();
        for (String fileName : expectedFiles) {
            files.put(fileName, new File(v8OutDir, fileName));
        }
        File smooshDir = new File(v8OutDir, "smoosher");
        FileUtils.forceMkdir(smooshDir);
        for (Map.Entry<String, File> entry : Smoosh.smoosh(v8OutDir, smooshDir, files).entrySet()) {
            entry.getValue().delete();
        }
        for (File file : smooshDir.listFiles()) {
            Files.move(file, new File(v8OutDir, file.getName()));
        }
        if (!smooshDir.delete()) {
            log.info("Unable to delete temporary dir[%s], contains[%s]", smooshDir, Arrays.asList(smooshDir.listFiles()));
            throw new IOException(String.format("Unable to delete temporary dir[%s]", smooshDir));
        }
        createIndexDrdFile(IndexIO.V8_VERSION, v8OutDir, GenericIndexed.fromIterable(mergedDimensions, GenericIndexed.STRING_STRATEGY), GenericIndexed.fromIterable(mergedMetrics, GenericIndexed.STRING_STRATEGY), dataInterval, indexSpec.getBitmapSerdeFactory());
        indexIO.getDefaultIndexIOHandler().convertV8toV9(v8OutDir, outDir, indexSpec);
        return outDir;
    } catch (Throwable t) {
        throw closer.rethrow(t);
    } finally {
        closer.close();
    }
}
Also used: ArrayList(java.util.ArrayList), CompressedObjectStrategy(io.druid.segment.data.CompressedObjectStrategy), DateTime(org.joda.time.DateTime), ComplexMetricColumnSerializer(io.druid.segment.serde.ComplexMetricColumnSerializer), TmpFileIOPeon(io.druid.segment.data.TmpFileIOPeon), TreeSet(java.util.TreeSet), FileOutputStream(java.io.FileOutputStream), IntBuffer(java.nio.IntBuffer), File(java.io.File), Map(java.util.Map), Nullable(javax.annotation.Nullable), ColumnCapabilitiesImpl(io.druid.segment.column.ColumnCapabilitiesImpl), Interval(org.joda.time.Interval), ComplexMetricSerde(io.druid.segment.serde.ComplexMetricSerde), IOPeon(io.druid.segment.data.IOPeon), ColumnCapabilities(io.druid.segment.column.ColumnCapabilities), Function(com.google.common.base.Function), ByteSink(com.google.common.io.ByteSink), ISE(io.druid.java.util.common.ISE), Closer(com.google.common.io.Closer), ValueType(io.druid.segment.column.ValueType), FileChannel(java.nio.channels.FileChannel), IOException(java.io.IOException), AggregatorFactory(io.druid.query.aggregation.AggregatorFactory), CompressionFactory(io.druid.segment.data.CompressionFactory), FileOutputSupplier(io.druid.common.guava.FileOutputSupplier), LongSupplierSerializer(io.druid.segment.data.LongSupplierSerializer)
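
Note the cleanup pattern in makeIndexFiles: the TmpFileIOPeon, its tmpPeonFiles directory, and the v8-tmp directory are all registered with a Guava Closer, so they are released even if the merge throws. registerDeleteDirectory is an IndexMerger helper that is not part of this excerpt; a minimal equivalent, written here as an assumption about what it does, looks like this:

import com.google.common.io.Closer;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import org.apache.commons.io.FileUtils;

public class DeleteOnCloseSketch {

    // Registers dir with the closer so it is removed when closer.close() runs, mirroring
    // what the registerDeleteDirectory(closer, v8OutDir) calls above appear to do.
    static void registerDeleteDirectory(Closer closer, final File dir) {
        closer.register(new Closeable() {

            @Override
            public void close() throws IOException {
                FileUtils.deleteDirectory(dir);
            }
        });
    }
}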

Example 4 with TmpFileIOPeon

Use of io.druid.segment.data.TmpFileIOPeon in project druid by druid-io.

From the class FloatCompressionBenchmarkFileGenerator, method main:

public static void main(String[] args) throws IOException, URISyntaxException {
    if (args.length >= 1) {
        dirPath = args[0];
    }
    BenchmarkColumnSchema enumeratedSchema = BenchmarkColumnSchema.makeEnumerated("", ValueType.FLOAT, true, 1, 0d, ImmutableList.<Object>of(0f, 1.1f, 2.2f, 3.3f, 4.4f), ImmutableList.of(0.95, 0.001, 0.0189, 0.03, 0.0001));
    BenchmarkColumnSchema zipfLowSchema = BenchmarkColumnSchema.makeZipf("", ValueType.FLOAT, true, 1, 0d, -1, 1000, 1d);
    BenchmarkColumnSchema zipfHighSchema = BenchmarkColumnSchema.makeZipf("", ValueType.FLOAT, true, 1, 0d, -1, 1000, 3d);
    BenchmarkColumnSchema sequentialSchema = BenchmarkColumnSchema.makeSequential("", ValueType.FLOAT, true, 1, 0d, 1470187671, 2000000000);
    BenchmarkColumnSchema uniformSchema = BenchmarkColumnSchema.makeContinuousUniform("", ValueType.FLOAT, true, 1, 0d, 0, 1000);
    Map<String, BenchmarkColumnValueGenerator> generators = new HashMap<>();
    generators.put("enumerate", new BenchmarkColumnValueGenerator(enumeratedSchema, 1));
    generators.put("zipfLow", new BenchmarkColumnValueGenerator(zipfLowSchema, 1));
    generators.put("zipfHigh", new BenchmarkColumnValueGenerator(zipfHighSchema, 1));
    generators.put("sequential", new BenchmarkColumnValueGenerator(sequentialSchema, 1));
    generators.put("uniform", new BenchmarkColumnValueGenerator(uniformSchema, 1));
    File dir = new File(dirPath);
    dir.mkdir();
    // create data files using BenchmarkColumnValueGenerator
    for (Map.Entry<String, BenchmarkColumnValueGenerator> entry : generators.entrySet()) {
        final File dataFile = new File(dir, entry.getKey());
        dataFile.delete();
        try (Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(dataFile)))) {
            for (int i = 0; i < ROW_NUM; i++) {
                writer.write((Float) entry.getValue().generateRowValue() + "\n");
            }
        }
    }
    // create compressed files for each CompressionStrategy provided
    for (Map.Entry<String, BenchmarkColumnValueGenerator> entry : generators.entrySet()) {
        for (CompressedObjectStrategy.CompressionStrategy compression : compressions) {
            String name = entry.getKey() + "-" + compression.toString();
            System.out.print(name + ": ");
            File compFile = new File(dir, name);
            compFile.delete();
            File dataFile = new File(dir, entry.getKey());
            TmpFileIOPeon iopeon = new TmpFileIOPeon(true);
            FloatSupplierSerializer writer = CompressionFactory.getFloatSerializer(iopeon, "float", ByteOrder.nativeOrder(), compression);
            BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(dataFile)));
            try (FileChannel output = FileChannel.open(compFile.toPath(), StandardOpenOption.CREATE_NEW, StandardOpenOption.WRITE)) {
                writer.open();
                String line;
                while ((line = br.readLine()) != null) {
                    writer.add(Float.parseFloat(line));
                }
                final ByteArrayOutputStream baos = new ByteArrayOutputStream();
                writer.closeAndConsolidate(new ByteSink() {

                    @Override
                    public OutputStream openStream() throws IOException {
                        return baos;
                    }
                });
                output.write(ByteBuffer.wrap(baos.toByteArray()));
            } finally {
                iopeon.close();
                br.close();
            }
            System.out.print(compFile.length() / 1024 + "\n");
        }
    }
}
Also used: HashMap(java.util.HashMap), ByteArrayOutputStream(java.io.ByteArrayOutputStream), OutputStream(java.io.OutputStream), FileOutputStream(java.io.FileOutputStream), CompressedObjectStrategy(io.druid.segment.data.CompressedObjectStrategy), BufferedWriter(java.io.BufferedWriter), ByteSink(com.google.common.io.ByteSink), TmpFileIOPeon(io.druid.segment.data.TmpFileIOPeon), FloatSupplierSerializer(io.druid.segment.data.FloatSupplierSerializer), InputStreamReader(java.io.InputStreamReader), FileChannel(java.nio.channels.FileChannel), BenchmarkColumnValueGenerator(io.druid.benchmark.datagen.BenchmarkColumnValueGenerator), IOException(java.io.IOException), FileInputStream(java.io.FileInputStream), BufferedReader(java.io.BufferedReader), OutputStreamWriter(java.io.OutputStreamWriter), BenchmarkColumnSchema(io.druid.benchmark.datagen.BenchmarkColumnSchema), File(java.io.File), Map(java.util.Map), Writer(java.io.Writer)

Example 5 with TmpFileIOPeon

Use of io.druid.segment.data.TmpFileIOPeon in project druid by druid-io.

From the class IndexMergerV9, method makeIndexFiles:

@Override
protected File makeIndexFiles(final List<IndexableAdapter> adapters, final AggregatorFactory[] metricAggs, final File outDir, final ProgressIndicator progress, final List<String> mergedDimensions, final List<String> mergedMetrics, final Function<ArrayList<Iterable<Rowboat>>, Iterable<Rowboat>> rowMergerFn, final IndexSpec indexSpec) throws IOException {
    progress.start();
    progress.progress();
    List<Metadata> metadataList = Lists.transform(adapters, new Function<IndexableAdapter, Metadata>() {

        @Override
        public Metadata apply(IndexableAdapter input) {
            return input.getMetadata();
        }
    });
    Metadata segmentMetadata = null;
    if (metricAggs != null) {
        AggregatorFactory[] combiningMetricAggs = new AggregatorFactory[metricAggs.length];
        for (int i = 0; i < metricAggs.length; i++) {
            combiningMetricAggs[i] = metricAggs[i].getCombiningFactory();
        }
        segmentMetadata = Metadata.merge(metadataList, combiningMetricAggs);
    } else {
        segmentMetadata = Metadata.merge(metadataList, null);
    }
    Closer closer = Closer.create();
    try {
        final FileSmoosher v9Smoosher = new FileSmoosher(outDir);
        final File v9TmpDir = new File(outDir, "v9-tmp");
        FileUtils.forceMkdir(v9TmpDir);
        registerDeleteDirectory(closer, v9TmpDir);
        log.info("Start making v9 index files, outDir:%s", outDir);
        File tmpPeonFilesDir = new File(v9TmpDir, "tmpPeonFiles");
        FileUtils.forceMkdir(tmpPeonFilesDir);
        registerDeleteDirectory(closer, tmpPeonFilesDir);
        final IOPeon ioPeon = new TmpFileIOPeon(tmpPeonFilesDir, false);
        closer.register(ioPeon);
        long startTime = System.currentTimeMillis();
        ByteStreams.write(Ints.toByteArray(IndexIO.V9_VERSION), Files.newOutputStreamSupplier(new File(outDir, "version.bin")));
        log.info("Completed version.bin in %,d millis.", System.currentTimeMillis() - startTime);
        progress.progress();
        startTime = System.currentTimeMillis();
        try (FileOutputStream fos = new FileOutputStream(new File(outDir, "factory.json"))) {
            mapper.writeValue(fos, new MMappedQueryableSegmentizerFactory(indexIO));
        }
        log.info("Completed factory.json in %,d millis", System.currentTimeMillis() - startTime);
        progress.progress();
        final Map<String, ValueType> metricsValueTypes = Maps.newTreeMap(Ordering.<String>natural().nullsFirst());
        final Map<String, String> metricTypeNames = Maps.newTreeMap(Ordering.<String>natural().nullsFirst());
        final List<ColumnCapabilitiesImpl> dimCapabilities = Lists.newArrayListWithCapacity(mergedDimensions.size());
        mergeCapabilities(adapters, mergedDimensions, metricsValueTypes, metricTypeNames, dimCapabilities);
        final DimensionHandler[] handlers = makeDimensionHandlers(mergedDimensions, dimCapabilities);
        final List<DimensionMerger> mergers = new ArrayList<>();
        for (int i = 0; i < mergedDimensions.size(); i++) {
            mergers.add(handlers[i].makeMerger(indexSpec, v9TmpDir, ioPeon, dimCapabilities.get(i), progress));
        }
        /************* Setup Dim Conversions **************/
        progress.progress();
        startTime = System.currentTimeMillis();
        final ArrayList<Map<String, IntBuffer>> dimConversions = Lists.newArrayListWithCapacity(adapters.size());
        final ArrayList<Boolean> dimensionSkipFlag = Lists.newArrayListWithCapacity(mergedDimensions.size());
        final ArrayList<Boolean> convertMissingDimsFlags = Lists.newArrayListWithCapacity(mergedDimensions.size());
        writeDimValueAndSetupDimConversion(adapters, progress, mergedDimensions, mergers);
        log.info("Completed dim conversions in %,d millis.", System.currentTimeMillis() - startTime);
        /************* Walk through data sets, merge them, and write merged columns *************/
        progress.progress();
        final Iterable<Rowboat> theRows = makeRowIterable(adapters, mergedDimensions, mergedMetrics, rowMergerFn, dimCapabilities, handlers, mergers);
        final LongColumnSerializer timeWriter = setupTimeWriter(ioPeon, indexSpec);
        final ArrayList<GenericColumnSerializer> metWriters = setupMetricsWriters(ioPeon, mergedMetrics, metricsValueTypes, metricTypeNames, indexSpec);
        final List<IntBuffer> rowNumConversions = Lists.newArrayListWithCapacity(adapters.size());
        mergeIndexesAndWriteColumns(adapters, progress, theRows, timeWriter, metWriters, rowNumConversions, mergers);
        /************ Create Inverted Indexes and Finalize Build Columns *************/
        final String section = "build inverted index and columns";
        progress.startSection(section);
        makeTimeColumn(v9Smoosher, progress, timeWriter);
        makeMetricsColumns(v9Smoosher, progress, mergedMetrics, metricsValueTypes, metricTypeNames, metWriters);
        for (int i = 0; i < mergedDimensions.size(); i++) {
            DimensionMergerV9 merger = (DimensionMergerV9) mergers.get(i);
            merger.writeIndexes(rowNumConversions, closer);
            if (merger.canSkip()) {
                continue;
            }
            ColumnDescriptor columnDesc = merger.makeColumnDescriptor();
            makeColumn(v9Smoosher, mergedDimensions.get(i), columnDesc);
        }
        progress.stopSection(section);
        /************* Make index.drd & metadata.drd files **************/
        progress.progress();
        makeIndexBinary(v9Smoosher, adapters, outDir, mergedDimensions, mergedMetrics, progress, indexSpec, mergers);
        makeMetadataBinary(v9Smoosher, progress, segmentMetadata);
        v9Smoosher.close();
        progress.stop();
        return outDir;
    } catch (Throwable t) {
        throw closer.rethrow(t);
    } finally {
        closer.close();
    }
}
Also used: ArrayList(java.util.ArrayList), IOPeon(io.druid.segment.data.IOPeon), TmpFileIOPeon(io.druid.segment.data.TmpFileIOPeon), FileSmoosher(io.druid.java.util.common.io.smoosh.FileSmoosher), Closer(com.google.common.io.Closer), ValueType(io.druid.segment.column.ValueType), MMappedQueryableSegmentizerFactory(io.druid.segment.loading.MMappedQueryableSegmentizerFactory), ColumnDescriptor(io.druid.segment.column.ColumnDescriptor), AggregatorFactory(io.druid.query.aggregation.AggregatorFactory), FileOutputStream(java.io.FileOutputStream), IntBuffer(java.nio.IntBuffer), File(java.io.File), Map(java.util.Map), ColumnCapabilitiesImpl(io.druid.segment.column.ColumnCapabilitiesImpl)
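
Taken together, the examples show three ways of constructing the peon. The sketch below summarizes them; the meaning of the boolean flag and the location of the scratch files are inferred from usage in these excerpts and should be confirmed against the TmpFileIOPeon source.

import io.druid.segment.data.IOPeon;
import io.druid.segment.data.TmpFileIOPeon;
import java.io.File;
import java.io.IOException;

public class TmpFileIOPeonConstructionSketch {

    public static void main(String[] args) throws IOException {
        // Example 1: default construction; scratch files presumably land in the JVM's
        // default temporary directory.
        IOPeon defaultPeon = new TmpFileIOPeon();
        defaultPeon.close();

        // Examples 2 and 4: the boolean argument appears to allow a named scratch file to be
        // reopened or overwritten (assumption; the flag's semantics are not shown in these excerpts).
        IOPeon overwritingPeon = new TmpFileIOPeon(true);
        overwritingPeon.close();

        // Examples 3 and 5: scratch files are kept under an explicit directory that the caller
        // creates up front and registers for deletion (the tmpPeonFiles directory above).
        File tmpPeonFilesDir = new File("tmpPeonFiles");  // illustrative path only
        IOPeon dirPeon = new TmpFileIOPeon(tmpPeonFilesDir, false);
        dirPeon.close();
    }
}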

Aggregations

TmpFileIOPeon (io.druid.segment.data.TmpFileIOPeon): 5 usages
File (java.io.File): 5 usages
FileOutputStream (java.io.FileOutputStream): 4 usages
Map (java.util.Map): 4 usages
ByteSink (com.google.common.io.ByteSink): 3 usages
CompressedObjectStrategy (io.druid.segment.data.CompressedObjectStrategy): 3 usages
IOPeon (io.druid.segment.data.IOPeon): 3 usages
IOException (java.io.IOException): 3 usages
FileChannel (java.nio.channels.FileChannel): 3 usages
Closer (com.google.common.io.Closer): 2 usages
BenchmarkColumnSchema (io.druid.benchmark.datagen.BenchmarkColumnSchema): 2 usages
BenchmarkColumnValueGenerator (io.druid.benchmark.datagen.BenchmarkColumnValueGenerator): 2 usages
FileSmoosher (io.druid.java.util.common.io.smoosh.FileSmoosher): 2 usages
AggregatorFactory (io.druid.query.aggregation.AggregatorFactory): 2 usages
ColumnCapabilitiesImpl (io.druid.segment.column.ColumnCapabilitiesImpl): 2 usages
ValueType (io.druid.segment.column.ValueType): 2 usages
CompressionFactory (io.druid.segment.data.CompressionFactory): 2 usages
LongSupplierSerializer (io.druid.segment.data.LongSupplierSerializer): 2 usages
BufferedReader (java.io.BufferedReader): 2 usages
BufferedWriter (java.io.BufferedWriter): 2 usages