Search in sources :

Example 1 with OrcColumnId

use of io.prestosql.orc.metadata.OrcColumnId in project hetu-core by openlookeng.

Example from the class OrcFileTail, in the method readFrom.

/**
 * Reads and decodes the ORC file tail: the trailing PostScript (whose own length is
 * stored in the very last byte of the file), then the Metadata and Footer sections
 * whose lengths the PostScript declares.
 *
 * @param orcDataSource source the tail bytes are read from
 * @param writeValidation optional expected-state checks (version, compression,
 *        column names, rows per row group, statistics) applied after decoding
 * @return the decoded file tail: post script, decompressor, metadata, and footer
 * @throws OrcCorruptionException if the file is too small, the postscript length is
 *         invalid, the ORC magic is missing, or the footer declares no columns
 * @throws IOException on underlying read failures
 */
public static OrcFileTail readFrom(OrcDataSource orcDataSource, Optional<OrcWriteValidation> writeValidation) throws IOException {
    OrcFileTail orcFileTail = new OrcFileTail();
    // 
    // Layout of the file tail, from low to high offset:
    // 
    // variable: Footer
    // variable: Metadata
    // variable: PostScript - contains length of footer and metadata
    // 1 byte: postScriptSize
    // figure out the size of the file using the option or filesystem
    long size = orcDataSource.getSize();
    if (size <= PostScript.MAGIC.length()) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid file size %s", size);
    }
    // Speculatively read the last EXPECTED_FOOTER_SIZE bytes; usually the whole tail fits
    int expectedBufferSize = toIntExact(min(size, EXPECTED_FOOTER_SIZE));
    Slice buffer = orcDataSource.readFully(size - expectedBufferSize, expectedBufferSize);
    // get length of PostScript - last byte of the file
    int postScriptSize = buffer.getUnsignedByte(buffer.length() - SIZE_OF_BYTE);
    if (postScriptSize >= buffer.length()) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid postscript length %s", postScriptSize);
    }
    MetadataReader metadataReader = new ExceptionWrappingMetadataReader(orcDataSource.getId(), new OrcMetadataReader());
    // decode the post script
    try {
        orcFileTail.postScript = metadataReader.readPostScript(buffer.slice(buffer.length() - SIZE_OF_BYTE - postScriptSize, postScriptSize).getInput());
    } catch (OrcCorruptionException e) {
        // a corrupt postscript may just mean this is not ORC at all (e.g. an RCFile);
        // report "Not an ORC file" in that case instead of the decode error
        if (!isValidHeaderMagic(orcDataSource)) {
            throw new OrcCorruptionException(orcDataSource.getId(), "Not an ORC file");
        }
        throw e;
    }
    // verify this is a supported version
    checkOrcVersion(orcDataSource, orcFileTail.postScript.getVersion());
    validateWrite(validation -> validation.getVersion().equals(orcFileTail.postScript.getVersion()), writeValidation, orcDataSource, "Unexpected version");
    int bufferSize = toIntExact(orcFileTail.postScript.getCompressionBlockSize());
    // check compression codec is supported
    CompressionKind compressionKind = orcFileTail.postScript.getCompression();
    orcFileTail.decompressor = OrcDecompressor.createOrcDecompressor(orcDataSource.getId(), compressionKind, bufferSize);
    validateWrite(validation -> validation.getCompression() == compressionKind, writeValidation, orcDataSource, "Unexpected compression");
    PostScript.HiveWriterVersion hiveWriterVersion = orcFileTail.postScript.getHiveWriterVersion();
    int footerSize = toIntExact(orcFileTail.postScript.getFooterLength());
    int metadataSize = toIntExact(orcFileTail.postScript.getMetadataLength());
    // check if extra bytes need to be read
    Slice completeFooterSlice;
    int completeFooterSize = footerSize + metadataSize + postScriptSize + SIZE_OF_BYTE;
    if (completeFooterSize > buffer.length()) {
        // initial read was not large enough, so just read again with the correct size
        completeFooterSlice = orcDataSource.readFully(size - completeFooterSize, completeFooterSize);
    } else {
        // footer is already in the bytes in buffer, just adjust position, length
        completeFooterSlice = buffer.slice(buffer.length() - completeFooterSize, completeFooterSize);
    }
    // read metadata (possibly compressed, hence the OrcInputStream wrapper)
    Slice metadataSlice = completeFooterSlice.slice(0, metadataSize);
    try (InputStream metadataInputStream = new OrcInputStream(OrcChunkLoader.create(orcDataSource.getId(), metadataSlice, orcFileTail.decompressor, newSimpleAggregatedMemoryContext()))) {
        orcFileTail.metadata = metadataReader.readMetadata(hiveWriterVersion, metadataInputStream);
    }
    // read footer, which immediately follows the metadata section
    Slice footerSlice = completeFooterSlice.slice(metadataSize, footerSize);
    try (InputStream footerInputStream = new OrcInputStream(OrcChunkLoader.create(orcDataSource.getId(), footerSlice, orcFileTail.decompressor, newSimpleAggregatedMemoryContext()))) {
        orcFileTail.footer = metadataReader.readFooter(hiveWriterVersion, footerInputStream);
    }
    if (orcFileTail.footer.getTypes().size() == 0) {
        throw new OrcCorruptionException(orcDataSource.getId(), "File has no columns");
    }
    // column id 0 is the root struct; its field names are the top-level column names
    validateWrite(validation -> validation.getColumnNames().equals(orcFileTail.footer.getTypes().get(new OrcColumnId(0)).getFieldNames()), writeValidation, orcDataSource, "Unexpected column names");
    validateWrite(validation -> validation.getRowGroupMaxRowCount() == orcFileTail.footer.getRowsInRowGroup(), writeValidation, orcDataSource, "Unexpected rows in group");
    if (writeValidation.isPresent()) {
        writeValidation.get().validateMetadata(orcDataSource.getId(), orcFileTail.footer.getUserMetadata());
        writeValidation.get().validateFileStatistics(orcDataSource.getId(), orcFileTail.footer.getFileStats());
        writeValidation.get().validateStripeStatistics(orcDataSource.getId(), orcFileTail.footer.getStripes(), orcFileTail.metadata.getStripeStatsList());
    }
    return orcFileTail;
}
Also used : OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) CompressionKind(io.prestosql.orc.metadata.CompressionKind) OrcInputStream(io.prestosql.orc.stream.OrcInputStream) OrcInputStream(io.prestosql.orc.stream.OrcInputStream) InputStream(java.io.InputStream) OrcMetadataReader(io.prestosql.orc.metadata.OrcMetadataReader) ExceptionWrappingMetadataReader(io.prestosql.orc.metadata.ExceptionWrappingMetadataReader) OrcMetadataReader(io.prestosql.orc.metadata.OrcMetadataReader) MetadataReader(io.prestosql.orc.metadata.MetadataReader) PostScript(io.prestosql.orc.metadata.PostScript) Slice(io.airlift.slice.Slice) ExceptionWrappingMetadataReader(io.prestosql.orc.metadata.ExceptionWrappingMetadataReader)

Example 2 with OrcColumnId

use of io.prestosql.orc.metadata.OrcColumnId in project hetu-core by openlookeng.

Example from the class StripeReader, in the method readBloomFilterIndexes.

/**
 * Reads the bloom-filter indexes for all columns in the stripe.
 * <p>
 * UTF8 bloom filters ({@code BLOOM_FILTER_UTF8}) take precedence; the legacy
 * {@code BLOOM_FILTER} stream is only consulted for columns that did not supply
 * a UTF8 variant. Results are keyed consistently by the stream's column id
 * (the original code mixed {@code stream.getColumnId()} and
 * {@code entry.getKey().getColumnId()}, which must agree for the
 * {@code containsKey} guard to be correct).
 *
 * @param streams stream metadata for the stripe, keyed by stream id
 * @param streamsData chunk loaders providing the raw bytes of each stream
 * @param stripe the stripe being read; its offset is part of the cache key
 * @return immutable map from column id to that column's bloom filters
 * @throws IOException if a bloom filter stream cannot be read
 */
private Map<OrcColumnId, List<HashableBloomFilter>> readBloomFilterIndexes(Map<StreamId, Stream> streams, Map<StreamId, OrcChunkLoader> streamsData, StripeInformation stripe) throws IOException {
    HashMap<OrcColumnId, List<HashableBloomFilter>> bloomFilters = new HashMap<>();
    // First pass: UTF8 bloom filters always win.
    for (Entry<StreamId, Stream> entry : streams.entrySet()) {
        Stream stream = entry.getValue();
        if (stream.getStreamKind() == BLOOM_FILTER_UTF8) {
            bloomFilters.put(stream.getColumnId(), readBloomFilterIndex(entry.getKey(), streamsData, stripe));
        }
    }
    // Second pass: fall back to the legacy encoding only where no UTF8 filter exists.
    for (Entry<StreamId, Stream> entry : streams.entrySet()) {
        Stream stream = entry.getValue();
        if (stream.getStreamKind() == BLOOM_FILTER && !bloomFilters.containsKey(stream.getColumnId())) {
            bloomFilters.put(stream.getColumnId(), readBloomFilterIndex(entry.getKey(), streamsData, stripe));
        }
    }
    return ImmutableMap.copyOf(bloomFilters);
}

/**
 * Reads one bloom-filter stream, going through the bloom-filter cache when it is
 * enabled and falling back to a direct read if the cache load fails.
 *
 * @param streamId identifies the stream within the stripe (also part of the cache key)
 * @param streamsData chunk loaders providing the raw bytes of each stream
 * @param stripe the stripe being read; its offset disambiguates cache entries
 * @return the decoded bloom filters for the stream's column
 * @throws IOException if the stream cannot be read
 */
private List<HashableBloomFilter> readBloomFilterIndex(StreamId streamId, Map<StreamId, OrcChunkLoader> streamsData, StripeInformation stripe) throws IOException {
    OrcInputStream inputStream = new OrcInputStream(streamsData.get(streamId));
    if (!orcCacheProperties.isBloomFilterCacheEnabled()) {
        return metadataReader.readBloomFilterIndexes(inputStream);
    }
    // Cache key = (data source + mtime, stripe offset, stream id) so stale files never hit.
    OrcBloomFilterCacheKey bloomFilterCacheKey = new OrcBloomFilterCacheKey();
    bloomFilterCacheKey.setOrcDataSourceId(new OrcDataSourceIdWithTimeStamp(orcDataSource.getId(), orcDataSource.getLastModifiedTime()));
    bloomFilterCacheKey.setStripeOffset(stripe.getOffset());
    bloomFilterCacheKey.setStreamId(streamId);
    try {
        return orcCacheStore.getBloomFiltersCache().get(bloomFilterCacheKey, () -> metadataReader.readBloomFilterIndexes(inputStream));
    } catch (UncheckedExecutionException | ExecutionException executionException) {
        handleCacheLoadException(executionException);
        log.debug(executionException.getCause(), "Error while caching bloom filters. Falling back to default flow");
        return metadataReader.readBloomFilterIndexes(inputStream);
    }
}
Also used : OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) OrcInputStream(io.prestosql.orc.stream.OrcInputStream) UncheckedExecutionException(com.google.common.util.concurrent.UncheckedExecutionException) HashMap(java.util.HashMap) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) OrcInputStream(io.prestosql.orc.stream.OrcInputStream) ValueInputStream(io.prestosql.orc.stream.ValueInputStream) Stream(io.prestosql.orc.metadata.Stream) InputStream(java.io.InputStream) UncheckedExecutionException(com.google.common.util.concurrent.UncheckedExecutionException) ExecutionException(java.util.concurrent.ExecutionException)

Example 3 with OrcColumnId

use of io.prestosql.orc.metadata.OrcColumnId in project hetu-core by openlookeng.

Example from the class TestAbstractNumbericColumnReader, in the method testTypeCoercionDate.

@Test
public void testTypeCoercionDate() throws OrcCorruptionException {
    // The same HDFS URI serves as both the column path and the data source id.
    String path = "hdfs://hacluster/user/hive/warehouse/tpcds_orc_hive_1000.db/catalog_sales/cs_sold_date_sk=2452268/000896_0";
    OrcColumn dateColumn = new OrcColumn(path, new OrcColumnId(3), "cs_order_number", OrcType.OrcTypeKind.DATE, new OrcDataSourceId(path), ImmutableList.of());
    // The factory should select a DateColumnReader for a DATE column.
    ColumnReader actualReader = ColumnReaders.createColumnReader(type, dateColumn, AggregatedMemoryContext.newSimpleAggregatedMemoryContext(), null);
    DateColumnReader expectedReader = new DateColumnReader(type, dateColumn, AggregatedMemoryContext.newSimpleAggregatedMemoryContext().newLocalMemoryContext(ColumnReaders.class.getSimpleName()));
    assertEquals(actualReader.toString(), expectedReader.toString());
}
Also used : OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) DateColumnReader(io.prestosql.orc.reader.DateColumnReader) ShortColumnReader(io.prestosql.orc.reader.ShortColumnReader) DateColumnReader(io.prestosql.orc.reader.DateColumnReader) LongColumnReader(io.prestosql.orc.reader.LongColumnReader) ColumnReader(io.prestosql.orc.reader.ColumnReader) IntegerColumnReader(io.prestosql.orc.reader.IntegerColumnReader) Test(org.testng.annotations.Test)

Example 4 with OrcColumnId

use of io.prestosql.orc.metadata.OrcColumnId in project hetu-core by openlookeng.

Example from the class TestAbstractNumbericColumnReader, in the method testTypeCoercionBigInt.

@Test
public void testTypeCoercionBigInt() throws OrcCorruptionException {
    // The same HDFS URI serves as both the column path and the data source id.
    String path = "hdfs://hacluster/user/hive/warehouse/tpcds_orc_hive_1000.db/catalog_sales/cs_sold_date_sk=2452268/000896_0";
    OrcColumn bigintColumn = new OrcColumn(path, new OrcColumnId(3), "cs_order_number", OrcType.OrcTypeKind.LONG, new OrcDataSourceId(path), ImmutableList.of());
    // The factory should select a LongColumnReader for a LONG column.
    ColumnReader actualReader = ColumnReaders.createColumnReader(type, bigintColumn, AggregatedMemoryContext.newSimpleAggregatedMemoryContext(), null);
    LongColumnReader expectedReader = new LongColumnReader(type, bigintColumn, AggregatedMemoryContext.newSimpleAggregatedMemoryContext().newLocalMemoryContext(ColumnReaders.class.getSimpleName()));
    assertEquals(actualReader.toString(), expectedReader.toString());
}
Also used : OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) LongColumnReader(io.prestosql.orc.reader.LongColumnReader) ShortColumnReader(io.prestosql.orc.reader.ShortColumnReader) DateColumnReader(io.prestosql.orc.reader.DateColumnReader) LongColumnReader(io.prestosql.orc.reader.LongColumnReader) ColumnReader(io.prestosql.orc.reader.ColumnReader) IntegerColumnReader(io.prestosql.orc.reader.IntegerColumnReader) Test(org.testng.annotations.Test)

Example 5 with OrcColumnId

use of io.prestosql.orc.metadata.OrcColumnId in project hetu-core by openlookeng.

Example from the class MapColumnWriter, in the method finishRowGroup.

@Override
public Map<OrcColumnId, ColumnStatistics> finishRowGroup() {
    checkState(!closed);
    // A map column itself only tracks the non-null value count; min/max and
    // type-specific statistics do not apply at this level.
    ColumnStatistics mapStatistics = new ColumnStatistics((long) nonNullValueCount, 0, null, null, null, null, null, null, null, null);
    rowGroupColumnStatistics.add(mapStatistics);
    nonNullValueCount = 0;
    // Combine this column's statistics with those of the nested key and value writers.
    return ImmutableMap.<OrcColumnId, ColumnStatistics>builder()
            .put(columnId, mapStatistics)
            .putAll(keyWriter.finishRowGroup())
            .putAll(valueWriter.finishRowGroup())
            .build();
}
Also used : ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) ImmutableMap(com.google.common.collect.ImmutableMap)

Aggregations

OrcColumnId (io.prestosql.orc.metadata.OrcColumnId)23 Stream (io.prestosql.orc.metadata.Stream)9 ImmutableMap (com.google.common.collect.ImmutableMap)8 ColumnStatistics (io.prestosql.orc.metadata.statistics.ColumnStatistics)7 Test (org.testng.annotations.Test)7 ArrayList (java.util.ArrayList)6 List (java.util.List)6 Slice (io.airlift.slice.Slice)5 ImmutableList (com.google.common.collect.ImmutableList)4 CompressionKind (io.prestosql.orc.metadata.CompressionKind)4 ColumnReader (io.prestosql.orc.reader.ColumnReader)4 DateColumnReader (io.prestosql.orc.reader.DateColumnReader)4 IntegerColumnReader (io.prestosql.orc.reader.IntegerColumnReader)4 LongColumnReader (io.prestosql.orc.reader.LongColumnReader)4 ShortColumnReader (io.prestosql.orc.reader.ShortColumnReader)4 OrcInputStream (io.prestosql.orc.stream.OrcInputStream)4 InputStream (java.io.InputStream)4 Map (java.util.Map)4 ColumnEncoding (io.prestosql.orc.metadata.ColumnEncoding)3 StripeFooter (io.prestosql.orc.metadata.StripeFooter)3