Example 6 with HoodieFileReader

use of org.apache.hudi.io.storage.HoodieFileReader in project hudi by apache.

the class HoodieMergeHelper method runMerge.

@Override
public void runMerge(HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table, HoodieMergeHandle<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> mergeHandle) throws IOException {
    final boolean externalSchemaTransformation = table.getConfig().shouldUseExternalSchemaTransformation();
    Configuration cfgForHoodieFile = new Configuration(table.getHadoopConf());
    HoodieBaseFile baseFile = mergeHandle.baseFileForMerge();
    final GenericDatumWriter<GenericRecord> gWriter;
    final GenericDatumReader<GenericRecord> gReader;
    Schema readSchema;
    if (externalSchemaTransformation || baseFile.getBootstrapBaseFile().isPresent()) {
        readSchema = HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), mergeHandle.getOldFilePath()).getSchema();
        gWriter = new GenericDatumWriter<>(readSchema);
        gReader = new GenericDatumReader<>(readSchema, mergeHandle.getWriterSchemaWithMetaFields());
    } else {
        gReader = null;
        gWriter = null;
        readSchema = mergeHandle.getWriterSchemaWithMetaFields();
    }
    BoundedInMemoryExecutor<GenericRecord, GenericRecord, Void> wrapper = null;
    HoodieFileReader<GenericRecord> reader = HoodieFileReaderFactory.getFileReader(cfgForHoodieFile, mergeHandle.getOldFilePath());
    try {
        final Iterator<GenericRecord> readerIterator;
        if (baseFile.getBootstrapBaseFile().isPresent()) {
            readerIterator = getMergingIterator(table, mergeHandle, baseFile, reader, readSchema, externalSchemaTransformation);
        } else {
            readerIterator = reader.getRecordIterator(readSchema);
        }
        ThreadLocal<BinaryEncoder> encoderCache = new ThreadLocal<>();
        ThreadLocal<BinaryDecoder> decoderCache = new ThreadLocal<>();
        wrapper = new BoundedInMemoryExecutor(table.getConfig().getWriteBufferLimitBytes(), readerIterator, new UpdateHandler(mergeHandle), record -> {
            if (!externalSchemaTransformation) {
                return record;
            }
            return transformRecordBasedOnNewSchema(gReader, gWriter, encoderCache, decoderCache, (GenericRecord) record);
        }, table.getPreExecuteRunnable());
        wrapper.execute();
    } catch (Exception e) {
        throw new HoodieException(e);
    } finally {
        if (reader != null) {
            reader.close();
        }
        mergeHandle.close();
        if (null != wrapper) {
            wrapper.shutdownNow();
        }
    }
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) Schema(org.apache.avro.Schema) Iterator(java.util.Iterator) HoodieException(org.apache.hudi.exception.HoodieException) BinaryDecoder(org.apache.avro.io.BinaryDecoder) IOException(java.io.IOException) BinaryEncoder(org.apache.avro.io.BinaryEncoder) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieMergeHandle(org.apache.hudi.io.HoodieMergeHandle) HoodieFileReaderFactory(org.apache.hudi.io.storage.HoodieFileReaderFactory) Configuration(org.apache.hadoop.conf.Configuration) HoodieKey(org.apache.hudi.common.model.HoodieKey) BoundedInMemoryExecutor(org.apache.hudi.common.util.queue.BoundedInMemoryExecutor) GenericDatumReader(org.apache.avro.generic.GenericDatumReader)
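
The HoodieFileReader pattern that runMerge builds on can be isolated into a small sketch: obtain a reader for the old base file from HoodieFileReaderFactory, iterate its records with the chosen read schema, and close the reader when done. This is a minimal illustration, not upstream code; the method name is hypothetical, and oldFilePath and readSchema are assumed inputs (the Configuration would normally come from the table's Hadoop conf, as in the method above).

private static void consumeBaseFileRecords(Configuration conf, Path oldFilePath, Schema readSchema) throws IOException {
    // Hedged sketch (hypothetical helper): open a reader for the base file,
    // stream its records with the given read schema, then always close the reader.
    HoodieFileReader<GenericRecord> reader = HoodieFileReaderFactory.getFileReader(conf, oldFilePath);
    try {
        Iterator<GenericRecord> recordIterator = reader.getRecordIterator(readSchema);
        while (recordIterator.hasNext()) {
            GenericRecord record = recordIterator.next();
            // hand each record to the merge handle / update handler
        }
    } finally {
        reader.close();
    }
}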

Example 7 with HoodieFileReader

use of org.apache.hudi.io.storage.HoodieFileReader in project hudi by apache.

the class HoodieKeyLookupHandle method getBloomFilter.

private BloomFilter getBloomFilter() {
    BloomFilter bloomFilter = null;
    HoodieTimer timer = new HoodieTimer().startTimer();
    try {
        if (config.isMetadataBloomFilterIndexEnabled()) {
            bloomFilter = hoodieTable.getMetadataTable()
                    .getBloomFilter(partitionPathFileIDPair.getLeft(), partitionPathFileIDPair.getRight())
                    .orElseThrow(() -> new HoodieIndexException("BloomFilter missing for " + partitionPathFileIDPair.getRight()));
        } else {
            try (HoodieFileReader reader = createNewFileReader()) {
                bloomFilter = reader.readBloomFilter();
            }
        }
    } catch (IOException e) {
        throw new HoodieIndexException(String.format("Error reading bloom filter from %s", getPartitionPathFileIDPair()), e);
    }
    LOG.info(String.format("Read bloom filter from %s in %d ms", partitionPathFileIDPair, timer.endTimer()));
    return bloomFilter;
}
Also used : HoodieTimer(org.apache.hudi.common.util.HoodieTimer) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) HoodieIndexException(org.apache.hudi.exception.HoodieIndexException) IOException(java.io.IOException) BloomFilter(org.apache.hudi.common.bloom.BloomFilter)
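
The non-metadata branch above reduces to a small helper: open a HoodieFileReader for the base file and read its bloom filter. The sketch below is hypothetical (not the upstream method) and assumes the partition's base-file Path and a Hadoop Configuration are already known, which is roughly what createNewFileReader() encapsulates inside the handle.

private static BloomFilter readBloomFilterFromBaseFile(Configuration conf, Path baseFilePath) throws IOException {
    // Hypothetical helper: HoodieFileReader is closeable, so try-with-resources releases it.
    try (HoodieFileReader reader = HoodieFileReaderFactory.getFileReader(conf, baseFilePath)) {
        return reader.readBloomFilter();
    }
}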

Example 8 with HoodieFileReader

use of org.apache.hudi.io.storage.HoodieFileReader in project hudi by apache.

the class HoodieIndexUtils method filterKeysFromFile.

/**
 * Given a list of row keys and one file, return only row keys existing in that file.
 *
 * @param filePath            - File to filter keys from
 * @param candidateRecordKeys - Candidate keys to filter
 * @param configuration       - Hadoop configuration used to open the file reader
 * @return List of candidate keys that are available in the file
 */
public static List<String> filterKeysFromFile(Path filePath, List<String> candidateRecordKeys, Configuration configuration) throws HoodieIndexException {
    ValidationUtils.checkArgument(FSUtils.isBaseFile(filePath));
    List<String> foundRecordKeys = new ArrayList<>();
    try {
        // Load all rowKeys from the file, to double-confirm
        if (!candidateRecordKeys.isEmpty()) {
            HoodieTimer timer = new HoodieTimer().startTimer();
            HoodieFileReader fileReader = HoodieFileReaderFactory.getFileReader(configuration, filePath);
            Set<String> fileRowKeys = fileReader.filterRowKeys(new TreeSet<>(candidateRecordKeys));
            foundRecordKeys.addAll(fileRowKeys);
            LOG.info(String.format("Checked keys against file %s, in %d ms. #candidates (%d) #found (%d)", filePath, timer.endTimer(), candidateRecordKeys.size(), foundRecordKeys.size()));
            if (LOG.isDebugEnabled()) {
                LOG.debug("Keys matching for file " + filePath + " => " + foundRecordKeys);
            }
        }
    } catch (Exception e) {
        throw new HoodieIndexException("Error checking candidate keys against file.", e);
    }
    return foundRecordKeys;
}
Also used : ArrayList(java.util.ArrayList) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) HoodieIndexException(org.apache.hudi.exception.HoodieIndexException)
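
A short usage sketch for filterKeysFromFile follows; the helper name, path, and candidate keys are made-up placeholders, and the Configuration would normally be the table's Hadoop conf. Only keys actually present in the base file are returned, which is how bloom-filter candidates get double-confirmed.

private static List<String> findKeysPresentInBaseFile(Configuration conf) {
    // Hypothetical illustration only: path and keys are placeholders.
    Path baseFilePath = new Path("/data/hudi_table/2022/01/01/fileId1_1-0-1_20220101000000.parquet");
    List<String> candidateKeys = Arrays.asList("key-001", "key-002", "key-003");
    // Returns only the candidates that actually exist in the base file.
    return HoodieIndexUtils.filterKeysFromFile(baseFilePath, candidateKeys, conf);
}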

Example 9 with HoodieFileReader

use of org.apache.hudi.io.storage.HoodieFileReader in project hudi by apache.

the class JavaMergeHelper method runMerge.

@Override
public void runMerge(HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table, HoodieMergeHandle<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> upsertHandle) throws IOException {
    final boolean externalSchemaTransformation = table.getConfig().shouldUseExternalSchemaTransformation();
    Configuration cfgForHoodieFile = new Configuration(table.getHadoopConf());
    HoodieMergeHandle<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> mergeHandle = upsertHandle;
    HoodieBaseFile baseFile = mergeHandle.baseFileForMerge();
    final GenericDatumWriter<GenericRecord> gWriter;
    final GenericDatumReader<GenericRecord> gReader;
    Schema readSchema;
    if (externalSchemaTransformation || baseFile.getBootstrapBaseFile().isPresent()) {
        readSchema = HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), mergeHandle.getOldFilePath()).getSchema();
        gWriter = new GenericDatumWriter<>(readSchema);
        gReader = new GenericDatumReader<>(readSchema, mergeHandle.getWriterSchemaWithMetaFields());
    } else {
        gReader = null;
        gWriter = null;
        readSchema = mergeHandle.getWriterSchemaWithMetaFields();
    }
    BoundedInMemoryExecutor<GenericRecord, GenericRecord, Void> wrapper = null;
    HoodieFileReader<GenericRecord> reader = HoodieFileReaderFactory.<GenericRecord>getFileReader(cfgForHoodieFile, mergeHandle.getOldFilePath());
    try {
        final Iterator<GenericRecord> readerIterator;
        if (baseFile.getBootstrapBaseFile().isPresent()) {
            readerIterator = getMergingIterator(table, mergeHandle, baseFile, reader, readSchema, externalSchemaTransformation);
        } else {
            readerIterator = reader.getRecordIterator(readSchema);
        }
        ThreadLocal<BinaryEncoder> encoderCache = new ThreadLocal<>();
        ThreadLocal<BinaryDecoder> decoderCache = new ThreadLocal<>();
        wrapper = new BoundedInMemoryExecutor<>(table.getConfig().getWriteBufferLimitBytes(), new IteratorBasedQueueProducer<>(readerIterator), Option.of(new UpdateHandler(mergeHandle)), record -> {
            if (!externalSchemaTransformation) {
                return record;
            }
            return transformRecordBasedOnNewSchema(gReader, gWriter, encoderCache, decoderCache, (GenericRecord) record);
        });
        wrapper.execute();
    } catch (Exception e) {
        throw new HoodieException(e);
    } finally {
        if (reader != null) {
            reader.close();
        }
        mergeHandle.close();
        if (null != wrapper) {
            wrapper.shutdownNow();
        }
    }
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) Schema(org.apache.avro.Schema) Iterator(java.util.Iterator) IteratorBasedQueueProducer(org.apache.hudi.common.util.queue.IteratorBasedQueueProducer) HoodieException(org.apache.hudi.exception.HoodieException) BinaryDecoder(org.apache.avro.io.BinaryDecoder) Option(org.apache.hudi.common.util.Option) IOException(java.io.IOException) BinaryEncoder(org.apache.avro.io.BinaryEncoder) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) HoodieMergeHandle(org.apache.hudi.io.HoodieMergeHandle) HoodieFileReaderFactory(org.apache.hudi.io.storage.HoodieFileReaderFactory) Configuration(org.apache.hadoop.conf.Configuration) HoodieKey(org.apache.hudi.common.model.HoodieKey) BoundedInMemoryExecutor(org.apache.hudi.common.util.queue.BoundedInMemoryExecutor) GenericDatumReader(org.apache.avro.generic.GenericDatumReader)

Example 10 with HoodieFileReader

use of org.apache.hudi.io.storage.HoodieFileReader in project hudi by apache.

the class JavaExecutionStrategy method readRecordsForGroupWithLogs.

/**
 * Read records from baseFiles and apply updates.
 */
private List<HoodieRecord<T>> readRecordsForGroupWithLogs(List<ClusteringOperation> clusteringOps, String instantTime) {
    HoodieWriteConfig config = getWriteConfig();
    HoodieTable table = getHoodieTable();
    List<HoodieRecord<T>> records = new ArrayList<>();
    clusteringOps.forEach(clusteringOp -> {
        long maxMemoryPerCompaction = IOUtils.getMaxMemoryPerCompaction(new JavaTaskContextSupplier(), config);
        LOG.info("MaxMemoryPerCompaction run as part of clustering => " + maxMemoryPerCompaction);
        try {
            Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
            HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
                    .withFileSystem(table.getMetaClient().getFs())
                    .withBasePath(table.getMetaClient().getBasePath())
                    .withLogFilePaths(clusteringOp.getDeltaFilePaths())
                    .withReaderSchema(readerSchema)
                    .withLatestInstantTime(instantTime)
                    .withMaxMemorySizeInBytes(maxMemoryPerCompaction)
                    .withReadBlocksLazily(config.getCompactionLazyBlockReadEnabled())
                    .withReverseReader(config.getCompactionReverseLogReadEnabled())
                    .withBufferSize(config.getMaxDFSStreamBufferSize())
                    .withSpillableMapBasePath(config.getSpillableMapBasePath())
                    .withPartition(clusteringOp.getPartitionPath())
                    .build();
            Option<HoodieFileReader> baseFileReader = StringUtils.isNullOrEmpty(clusteringOp.getDataFilePath())
                    ? Option.empty()
                    : Option.of(HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), new Path(clusteringOp.getDataFilePath())));
            HoodieTableConfig tableConfig = table.getMetaClient().getTableConfig();
            Iterator<HoodieRecord<T>> fileSliceReader = getFileSliceReader(
                    baseFileReader, scanner, readerSchema,
                    tableConfig.getPayloadClass(),
                    tableConfig.getPreCombineField(),
                    tableConfig.populateMetaFields()
                            ? Option.empty()
                            : Option.of(Pair.of(tableConfig.getRecordKeyFieldProp(), tableConfig.getPartitionFieldProp())));
            fileSliceReader.forEachRemaining(records::add);
        } catch (IOException e) {
            throw new HoodieClusteringException("Error reading input data for " + clusteringOp.getDataFilePath() + " and " + clusteringOp.getDeltaFilePaths(), e);
        }
    });
    return records;
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) IOException(java.io.IOException) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) HoodieClusteringException(org.apache.hudi.exception.HoodieClusteringException) HoodieTable(org.apache.hudi.table.HoodieTable) JavaTaskContextSupplier(org.apache.hudi.client.common.JavaTaskContextSupplier)
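
One detail worth isolating from the method above: a clustering operation may point at a log-only file slice, so the base-file reader is wrapped in an Option. A reduced sketch of just that decision follows; clusteringOp and hadoopConf are assumed to be in scope, and the same IOException handling as in the method above is expected around the factory call.

// Log-only slice: no base file, so the merged view comes from the log scanner alone.
// Otherwise, open a HoodieFileReader for the base file and merge log records on top of it.
Option<HoodieFileReader> baseFileReader = StringUtils.isNullOrEmpty(clusteringOp.getDataFilePath())
        ? Option.empty()
        : Option.of(HoodieFileReaderFactory.getFileReader(hadoopConf, new Path(clusteringOp.getDataFilePath())));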

Aggregations

HoodieFileReader (org.apache.hudi.io.storage.HoodieFileReader): 15
IOException (java.io.IOException): 13
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 10
Schema (org.apache.avro.Schema): 9
GenericRecord (org.apache.avro.generic.GenericRecord): 8
Option (org.apache.hudi.common.util.Option): 8
HoodieFileReaderFactory (org.apache.hudi.io.storage.HoodieFileReaderFactory): 8
ArrayList (java.util.ArrayList): 7
List (java.util.List): 7
Path (org.apache.hadoop.fs.Path): 7
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 7
Iterator (java.util.Iterator): 6
HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload): 6
Map (java.util.Map): 5
Collectors (java.util.stream.Collectors): 5
IndexedRecord (org.apache.avro.generic.IndexedRecord): 5
Pair (org.apache.hudi.common.util.collection.Pair): 5
HoodieException (org.apache.hudi.exception.HoodieException): 5
Stream (java.util.stream.Stream): 4
HoodieData (org.apache.hudi.common.data.HoodieData): 4