
Example 1 with HoodieAvroDataBlock

Use of org.apache.hudi.common.table.log.block.HoodieAvroDataBlock in project hudi by apache.

From the class HoodieTimelineArchiver, method writeToFile:

private void writeToFile(Schema wrapperSchema, List<IndexedRecord> records) throws Exception {
    if (records.size() > 0) {
        Map<HeaderMetadataType, String> header = new HashMap<>();
        header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, wrapperSchema.toString());
        final String keyField = table.getMetaClient().getTableConfig().getRecordKeyFieldProp();
        HoodieAvroDataBlock block = new HoodieAvroDataBlock(records, header, keyField);
        writer.appendBlock(block);
        records.clear();
    }
}
Also used : HashMap(java.util.HashMap) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock)
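
For context, a minimal read-back sketch (not part of HoodieTimelineArchiver): records archived this way can be iterated again with the HoodieLogFormat reader and HoodieAvroDataBlock APIs shown in the other examples. The fs, archiveLogPath, and wrapperSchema names below are assumptions for illustration.

// Minimal sketch, assuming fs, archiveLogPath and wrapperSchema are already available in scope.
try (HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(archiveLogPath), wrapperSchema)) {
    while (reader.hasNext()) {
        HoodieAvroDataBlock block = (HoodieAvroDataBlock) reader.next();
        try (ClosableIterator<IndexedRecord> itr = block.getRecordItr()) {
            // Each record is a wrapper record in wrapperSchema (HoodieArchivedMetaEntry for the archiver).
            itr.forEachRemaining(record -> System.out.println("Archived record: " + record));
        }
    }
}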

Example 2 with HoodieAvroDataBlock

Use of org.apache.hudi.common.table.log.block.HoodieAvroDataBlock in project hudi by apache.

From the class HoodieArchivedTimeline, method loadInstants:

/**
 * This method reads the selected instants. Do NOT use it directly; use one of the helper methods above.
 * If loadInstantDetails is set to true, this also updates the 'readCommits' map with commit details.
 * If filter is specified, only the filtered instants are loaded.
 * If commitsFilter is specified, only the filtered records are loaded.
 */
private List<HoodieInstant> loadInstants(TimeRangeFilter filter, boolean loadInstantDetails, Function<GenericRecord, Boolean> commitsFilter) {
    try {
        // List all files
        FileStatus[] fsStatuses = metaClient.getFs().globStatus(new Path(metaClient.getArchivePath() + "/.commits_.archive*"));
        // Sort files by version suffix in reverse (implies reverse chronological order)
        Arrays.sort(fsStatuses, new ArchiveFileVersionComparator());
        Set<HoodieInstant> instantsInRange = new HashSet<>();
        for (FileStatus fs : fsStatuses) {
            // Read the archived file
            try (HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(metaClient.getFs(), new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema())) {
                int instantsInPreviousFile = instantsInRange.size();
                // Read the avro blocks
                while (reader.hasNext()) {
                    HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next();
                    // Parse the records in this block and filter them to the desired range (e.g. by the startTime/endTime of records in the block)
                    try (ClosableIterator<IndexedRecord> itr = blk.getRecordItr()) {
                        StreamSupport.stream(Spliterators.spliteratorUnknownSize(itr, Spliterator.IMMUTABLE), true)
                                .filter(r -> commitsFilter.apply((GenericRecord) r))
                                .map(r -> readCommit((GenericRecord) r, loadInstantDetails))
                                .filter(c -> filter == null || filter.isInRange(c))
                                .forEach(instantsInRange::add);
                    }
                }
                if (filter != null) {
                    int instantsInCurrentFile = instantsInRange.size() - instantsInPreviousFile;
                    if (instantsInPreviousFile > 0 && instantsInCurrentFile == 0) {
                        // This signals we crossed lower bound of desired time window.
                        break;
                    }
                }
            } catch (Exception originalException) {
                // Merging small archive files can leave an incomplete archive file behind; that kind of exception needs to be ignored here.
                try {
                    Path planPath = new Path(metaClient.getArchivePath(), MERGE_ARCHIVE_PLAN_NAME);
                    HoodieWrapperFileSystem fileSystem = metaClient.getFs();
                    if (fileSystem.exists(planPath)) {
                        HoodieMergeArchiveFilePlan plan = TimelineMetadataUtils.deserializeAvroMetadata(FileIOUtils.readDataFromPath(fileSystem, planPath).get(), HoodieMergeArchiveFilePlan.class);
                        String mergedArchiveFileName = plan.getMergedArchiveFileName();
                        if (!StringUtils.isNullOrEmpty(mergedArchiveFileName) && fs.getPath().getName().equalsIgnoreCase(mergedArchiveFileName)) {
                            LOG.warn("Catch exception because of reading uncompleted merging archive file " + mergedArchiveFileName + ". Ignore it here.");
                            continue;
                        }
                    }
                    throw originalException;
                } catch (Exception e) {
                    // For example, a corrupted archive file and a corrupted plan may both exist; rethrow the original exception.
                    throw originalException;
                }
            }
        }
        ArrayList<HoodieInstant> result = new ArrayList<>(instantsInRange);
        Collections.sort(result);
        return result;
    } catch (IOException e) {
        throw new HoodieIOException("Could not load archived commit timeline from path " + metaClient.getArchivePath(), e);
    }
}
Also used : HoodieWrapperFileSystem(org.apache.hudi.common.fs.HoodieWrapperFileSystem) Arrays(java.util.Arrays) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) FileIOUtils(org.apache.hudi.common.util.FileIOUtils) HoodieArchivedMetaEntry(org.apache.hudi.avro.model.HoodieArchivedMetaEntry) Spliterators(java.util.Spliterators) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) ClosableIterator(org.apache.hudi.common.util.ClosableIterator) FileStatus(org.apache.hadoop.fs.FileStatus) Function(java.util.function.Function) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Logger(org.apache.log4j.Logger) StringUtils(org.apache.hudi.common.util.StringUtils) Matcher(java.util.regex.Matcher) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) StreamSupport(java.util.stream.StreamSupport) Nonnull(javax.annotation.Nonnull) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) IndexedRecord(org.apache.avro.generic.IndexedRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Set(java.util.Set) IOException(java.io.IOException) HoodieMergeArchiveFilePlan(org.apache.hudi.avro.model.HoodieMergeArchiveFilePlan) StandardCharsets(java.nio.charset.StandardCharsets) Serializable(java.io.Serializable) List(java.util.List) HoodiePartitionMetadata(org.apache.hudi.common.model.HoodiePartitionMetadata) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Pattern(java.util.regex.Pattern) Comparator(java.util.Comparator) Collections(java.util.Collections) Spliterator(java.util.Spliterator)
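
For reference, a hedged usage sketch (illustrative, not taken from the Hudi source): callers do not invoke the private loadInstants directly; the archived timeline is normally obtained through the table's meta client, which loads the instants internally. The metaClient variable below is assumed to exist.

// Minimal sketch, assuming metaClient is an already-built HoodieTableMetaClient for the table.
HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline();
archivedTimeline.getInstants().forEach(instant ->
    System.out.println("Archived instant: " + instant.getTimestamp() + " action=" + instant.getAction()));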

Example 3 with HoodieAvroDataBlock

Use of org.apache.hudi.common.table.log.block.HoodieAvroDataBlock in project hudi by apache.

From the class HiveTestUtil, method generateLogData:

private static HoodieLogFile generateLogData(Path parquetFilePath, boolean isLogSchemaSimple) throws IOException, InterruptedException, URISyntaxException {
    Schema schema = getTestDataSchema(isLogSchemaSimple);
    HoodieBaseFile dataFile = new HoodieBaseFile(fileSystem.getFileStatus(parquetFilePath));
    // Write a log file for this parquet file
    Writer logWriter = HoodieLogFormat.newWriterBuilder()
            .onParentPath(parquetFilePath.getParent())
            .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
            .withFileId(dataFile.getFileId())
            .overBaseCommit(dataFile.getCommitTime())
            .withFs(fileSystem)
            .build();
    List<IndexedRecord> records = (isLogSchemaSimple ? SchemaTestUtil.generateTestRecords(0, 100) : SchemaTestUtil.generateEvolvedTestRecords(100, 100));
    Map<HeaderMetadataType, String> header = new HashMap<>(2);
    header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, dataFile.getCommitTime());
    header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
    HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD);
    logWriter.appendBlock(dataBlock);
    logWriter.close();
    return logWriter.getLogFile();
}
Also used : HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) IndexedRecord(org.apache.avro.generic.IndexedRecord) HashMap(java.util.HashMap) Schema(org.apache.avro.Schema) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) ParquetWriter(org.apache.parquet.hadoop.ParquetWriter) Writer(org.apache.hudi.common.table.log.HoodieLogFormat.Writer)
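
As a hedged follow-up sketch (illustrative only, not part of HiveTestUtil), the HoodieLogFile returned by generateLogData can be inspected before a test uses it; parquetFilePath is assumed to point at an existing base file.

// Minimal sketch: generate the log file and print a few of its attributes.
HoodieLogFile logFile = generateLogData(parquetFilePath, true);
System.out.println("Created log file " + logFile.getPath()
    + " for fileId " + logFile.getFileId()
    + " at base commit " + logFile.getBaseCommitTime());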

Example 4 with HoodieAvroDataBlock

Use of org.apache.hudi.common.table.log.block.HoodieAvroDataBlock in project hudi by apache.

From the class HoodieFlinkWriteableTestTable, method appendRecordsToLogFile:

private Pair<String, HoodieLogFile> appendRecordsToLogFile(List<HoodieRecord> groupedRecords) throws Exception {
    String partitionPath = groupedRecords.get(0).getPartitionPath();
    HoodieRecordLocation location = groupedRecords.get(0).getCurrentLocation();
    try (HoodieLogFormat.Writer logWriter = HoodieLogFormat.newWriterBuilder()
            .onParentPath(new Path(basePath, partitionPath))
            .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
            .withFileId(location.getFileId())
            .overBaseCommit(location.getInstantTime())
            .withFs(fs)
            .build()) {
        Map<HeaderMetadataType, String> header = new java.util.HashMap<>();
        header.put(HeaderMetadataType.INSTANT_TIME, location.getInstantTime());
        header.put(HeaderMetadataType.SCHEMA, schema.toString());
        logWriter.appendBlock(new HoodieAvroDataBlock(groupedRecords.stream().map(r -> {
            try {
                GenericRecord val = (GenericRecord) ((HoodieRecordPayload) r.getData()).getInsertValue(schema).get();
                HoodieAvroUtils.addHoodieKeyToRecord(val, r.getRecordKey(), r.getPartitionPath(), "");
                return (IndexedRecord) val;
            } catch (IOException e) {
                LOG.warn("Failed to convert record " + r.toString(), e);
                return null;
            }
        }).collect(Collectors.toList()), header, HoodieRecord.RECORD_KEY_METADATA_FIELD));
        return Pair.of(partitionPath, logWriter.getLogFile());
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieTable(org.apache.hudi.table.HoodieTable) Arrays(java.util.Arrays) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) BloomFilterFactory(org.apache.hudi.common.bloom.BloomFilterFactory) BloomFilterTypeCode(org.apache.hudi.common.bloom.BloomFilterTypeCode) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) List(java.util.List) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) LogManager(org.apache.log4j.LogManager) Pair(org.apache.hudi.common.util.collection.Pair)
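
A small hedged variant (illustrative, not the project's code): the lambda above returns null for records whose payload cannot be converted, so a caller that wants the block to be free of null entries could drop them before building the HoodieAvroDataBlock. The schema, header, groupedRecords, and logWriter names are the same as in the method above; java.util.Objects is assumed to be imported.

// Minimal sketch: convert payloads to IndexedRecords and drop any that failed to convert.
List<IndexedRecord> converted = groupedRecords.stream().map(r -> {
    try {
        GenericRecord val = (GenericRecord) ((HoodieRecordPayload) r.getData()).getInsertValue(schema).get();
        HoodieAvroUtils.addHoodieKeyToRecord(val, r.getRecordKey(), r.getPartitionPath(), "");
        return (IndexedRecord) val;
    } catch (IOException e) {
        return null;
    }
}).filter(Objects::nonNull).collect(Collectors.toList());
logWriter.appendBlock(new HoodieAvroDataBlock(converted, header, HoodieRecord.RECORD_KEY_METADATA_FIELD));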

Example 5 with HoodieAvroDataBlock

Use of org.apache.hudi.common.table.log.block.HoodieAvroDataBlock in project hudi by apache.

From the class InputFormatTestUtil, method writeDataBlockToLogFile:

public static HoodieLogFormat.Writer writeDataBlockToLogFile(File partitionDir, FileSystem fs, Schema schema, String fileId, String baseCommit, String newCommit, int numberOfRecords, int offset, int logVersion, HoodieLogBlock.HoodieLogBlockType logBlockType) throws InterruptedException, IOException {
    HoodieLogFormat.Writer writer = HoodieLogFormat.newWriterBuilder()
            .onParentPath(new Path(partitionDir.getPath()))
            .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
            .withFileId(fileId)
            .withLogVersion(logVersion)
            .withRolloverLogWriteToken("1-0-1")
            .overBaseCommit(baseCommit)
            .withFs(fs)
            .build();
    List<IndexedRecord> records = new ArrayList<>();
    for (int i = offset; i < offset + numberOfRecords; i++) {
        records.add(SchemaTestUtil.generateAvroRecordFromJson(schema, i, newCommit, "fileid0"));
    }
    Schema writeSchema = records.get(0).getSchema();
    Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
    header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, newCommit);
    header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, writeSchema.toString());
    HoodieDataBlock dataBlock = null;
    if (logBlockType == HoodieLogBlock.HoodieLogBlockType.HFILE_DATA_BLOCK) {
        dataBlock = new HoodieHFileDataBlock(records, header, Compression.Algorithm.GZ);
    } else if (logBlockType == HoodieLogBlock.HoodieLogBlockType.PARQUET_DATA_BLOCK) {
        dataBlock = new HoodieParquetDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD, CompressionCodecName.GZIP);
    } else {
        dataBlock = new HoodieAvroDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD);
    }
    writer.appendBlock(dataBlock);
    return writer;
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieDataBlock(org.apache.hudi.common.table.log.block.HoodieDataBlock) IndexedRecord(org.apache.avro.generic.IndexedRecord) HashMap(java.util.HashMap) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) HoodieHFileDataBlock(org.apache.hudi.common.table.log.block.HoodieHFileDataBlock) HoodieParquetDataBlock(org.apache.hudi.common.table.log.block.HoodieParquetDataBlock) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat)
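
A hedged caller sketch (the concrete argument values are illustrative assumptions): tests typically choose a block type, write the block through this helper, and close the returned writer.

// Minimal sketch: write 100 Avro records into a new delta log file and close the writer.
HoodieLogFormat.Writer writer = InputFormatTestUtil.writeDataBlockToLogFile(
    partitionDir, fs, schema, "fileid0", "100", "101", 100, 0, 1,
    HoodieLogBlock.HoodieLogBlockType.AVRO_DATA_BLOCK);
writer.close();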

Aggregations

HoodieAvroDataBlock (org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) 16
IndexedRecord (org.apache.avro.generic.IndexedRecord) 14
HashMap (java.util.HashMap) 12
ArrayList (java.util.ArrayList) 10
Path (org.apache.hadoop.fs.Path) 10
HoodieLogFormat (org.apache.hudi.common.table.log.HoodieLogFormat) 10
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile) 9
IOException (java.io.IOException) 8
Schema (org.apache.avro.Schema) 8
List (java.util.List) 7
GenericRecord (org.apache.avro.generic.GenericRecord) 7
Collectors (java.util.stream.Collectors) 6
FileStatus (org.apache.hadoop.fs.FileStatus) 6
Option (org.apache.hudi.common.util.Option) 6
Map (java.util.Map) 5
HoodieAvroUtils (org.apache.hudi.avro.HoodieAvroUtils) 5
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient) 5
HeaderMetadataType (org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) 5
FileSystem (org.apache.hadoop.fs.FileSystem) 4
HoodieArchivedMetaEntry (org.apache.hudi.avro.model.HoodieArchivedMetaEntry) 4