Example 1 with RateLimiter

Use of org.apache.hudi.common.util.RateLimiter in project hudi by apache.

From the class SparkHoodieHBaseIndex, method updateLocationFunction.

private Function2<Integer, Iterator<WriteStatus>, Iterator<WriteStatus>> updateLocationFunction() {
    return (partition, statusIterator) -> {
        List<WriteStatus> writeStatusList = new ArrayList<>();
        // Grab the global HBase connection
        synchronized (SparkHoodieHBaseIndex.class) {
            if (hbaseConnection == null || hbaseConnection.isClosed()) {
                hbaseConnection = getHBaseConnection();
            }
        }
        final long startTimeForPutsTask = DateTime.now().getMillis();
        LOG.info("startTimeForPutsTask for this task: " + startTimeForPutsTask);
        try (BufferedMutator mutator = hbaseConnection.getBufferedMutator(TableName.valueOf(tableName))) {
            final RateLimiter limiter = RateLimiter.create(multiPutBatchSize, TimeUnit.SECONDS);
            while (statusIterator.hasNext()) {
                WriteStatus writeStatus = statusIterator.next();
                List<Mutation> mutations = new ArrayList<>();
                try {
                    long numOfInserts = writeStatus.getStat().getNumInserts();
                    LOG.info("Num of inserts in this WriteStatus: " + numOfInserts);
                    LOG.info("Total inserts in this job: " + this.totalNumInserts);
                    LOG.info("multiPutBatchSize for this job: " + this.multiPutBatchSize);
                    // Any calls beyond `multiPutBatchSize` within a second will be rate limited
                    for (HoodieRecord rec : writeStatus.getWrittenRecords()) {
                        if (!writeStatus.isErrored(rec.getKey())) {
                            Option<HoodieRecordLocation> loc = rec.getNewLocation();
                            if (loc.isPresent()) {
                                if (rec.getCurrentLocation() != null) {
                                    // This is an update, no need to update index
                                    continue;
                                }
                                Put put = new Put(Bytes.toBytes(rec.getRecordKey()));
                                put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN, Bytes.toBytes(loc.get().getInstantTime()));
                                put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN, Bytes.toBytes(loc.get().getFileId()));
                                put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN, Bytes.toBytes(rec.getPartitionPath()));
                                mutations.add(put);
                            } else {
                                // Delete existing index for a deleted record
                                Delete delete = new Delete(Bytes.toBytes(rec.getRecordKey()));
                                mutations.add(delete);
                            }
                        }
                        if (mutations.size() < multiPutBatchSize) {
                            continue;
                        }
                        doMutations(mutator, mutations, limiter);
                    }
                    // process remaining puts and deletes, if any
                    doMutations(mutator, mutations, limiter);
                } catch (Exception e) {
                    Exception we = new Exception("Error updating index for " + writeStatus, e);
                    LOG.error(we);
                    writeStatus.setGlobalError(we);
                }
                writeStatusList.add(writeStatus);
            }
            final long endPutsTime = DateTime.now().getMillis();
            LOG.info("hbase puts task time for this task: " + (endPutsTime - startTimeForPutsTask));
        } catch (IOException e) {
            throw new HoodieIndexException("Failed to Update Index locations because of exception with HBase Client", e);
        }
        return writeStatusList.iterator();
    };
}
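
Both the batched writes inside the loop and the final flush go through the doMutations helper, which is not shown on this page. The sketch below is a minimal, plausible shape for it, assuming Hudi's RateLimiter exposes a blocking tryAcquire(int numPermits); the method body here is an illustration, not the verbatim SparkHoodieHBaseIndex implementation.

private void doMutations(BufferedMutator mutator, List<Mutation> mutations, RateLimiter limiter) throws IOException {
    // Sketch only: the real SparkHoodieHBaseIndex#doMutations may differ in detail.
    if (mutations.isEmpty()) {
        return;
    }
    // Wait until the limiter grants one permit per mutation, so that no more than
    // multiPutBatchSize mutations are sent to HBase per second.
    limiter.tryAcquire(mutations.size());
    // Queue the batch, flush it to the region servers, then clear the local buffer.
    mutator.mutate(mutations);
    mutator.flush();
    mutations.clear();
}

With a helper of this shape, the comment in the loop above holds: any mutations beyond multiPutBatchSize within a second are throttled by the limiter instead of being pushed straight to HBase.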
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) Mutation(org.apache.hadoop.hbase.client.Mutation) Function2(org.apache.spark.api.java.function.Function2) Result(org.apache.hadoop.hbase.client.Result) Date(java.util.Date) RateLimiter(org.apache.hudi.common.util.RateLimiter) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) Logger(org.apache.log4j.Logger) Delete(org.apache.hadoop.hbase.client.Delete) Partitioner(org.apache.spark.Partitioner) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) HoodieDependentSystemUnavailableException(org.apache.hudi.exception.HoodieDependentSystemUnavailableException) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) BufferedMutator(org.apache.hadoop.hbase.client.BufferedMutator) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieIndexException(org.apache.hudi.exception.HoodieIndexException) Get(org.apache.hadoop.hbase.client.Get) Tuple2(scala.Tuple2) HoodieIndex(org.apache.hudi.index.HoodieIndex) Serializable(java.io.Serializable) List(java.util.List) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) RegionLocator(org.apache.hadoop.hbase.client.RegionLocator) HBaseConfiguration(org.apache.hadoop.hbase.HBaseConfiguration) ResultScanner(org.apache.hadoop.hbase.client.ResultScanner) ReflectionUtils(org.apache.hudi.common.util.ReflectionUtils) SparkMemoryUtils(org.apache.hudi.client.utils.SparkMemoryUtils) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) ArrayList(java.util.ArrayList) HTable(org.apache.hadoop.hbase.client.HTable) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) EmptyHoodieRecordPayload(org.apache.hudi.common.model.EmptyHoodieRecordPayload) LinkedList(java.util.LinkedList) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) Bytes(org.apache.hadoop.hbase.util.Bytes) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) TableName(org.apache.hadoop.hbase.TableName) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) Put(org.apache.hadoop.hbase.client.Put) SparkConf(org.apache.spark.SparkConf) DateTime(org.joda.time.DateTime) HoodieHBaseIndexConfig(org.apache.hudi.config.HoodieHBaseIndexConfig) IOException(java.io.IOException) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) ConnectionFactory(org.apache.hadoop.hbase.client.ConnectionFactory) Scan(org.apache.hadoop.hbase.client.Scan) TimeUnit(java.util.concurrent.TimeUnit) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HRegionLocation(org.apache.hadoop.hbase.HRegionLocation) Connection(org.apache.hadoop.hbase.client.Connection) HoodieKey(org.apache.hudi.common.model.HoodieKey) LogManager(org.apache.log4j.LogManager)

Example 2 with RateLimiter

Use of org.apache.hudi.common.util.RateLimiter in project hudi by apache.

From the class SparkHoodieHBaseIndex, method rollbackCommit.

@Override
public boolean rollbackCommit(String instantTime) {
    int multiGetBatchSize = config.getHbaseIndexGetBatchSize();
    boolean rollbackSync = config.getHBaseIndexRollbackSync();
    if (!config.getHBaseIndexRollbackSync()) {
        // Default Rollback in HbaseIndex is managed via method {@link #checkIfValidCommit()}
        return true;
    }
    synchronized (SparkHoodieHBaseIndex.class) {
        if (hbaseConnection == null || hbaseConnection.isClosed()) {
            hbaseConnection = getHBaseConnection();
        }
    }
    try (HTable hTable = (HTable) hbaseConnection.getTable(TableName.valueOf(tableName));
        BufferedMutator mutator = hbaseConnection.getBufferedMutator(TableName.valueOf(tableName))) {
        final RateLimiter limiter = RateLimiter.create(multiPutBatchSize, TimeUnit.SECONDS);
        Long rollbackTime = HoodieActiveTimeline.parseDateFromInstantTime(instantTime).getTime();
        Long currentTime = new Date().getTime();
        Scan scan = new Scan();
        scan.addFamily(SYSTEM_COLUMN_FAMILY);
        scan.setTimeRange(rollbackTime, currentTime);
        ResultScanner scanner = hTable.getScanner(scan);
        Iterator<Result> scannerIterator = scanner.iterator();
        List<Get> statements = new ArrayList<>();
        List<Result> currentVersionResults = new ArrayList<Result>();
        List<Mutation> mutations = new ArrayList<>();
        while (scannerIterator.hasNext()) {
            Result result = scannerIterator.next();
            currentVersionResults.add(result);
            statements.add(generateStatement(Bytes.toString(result.getRow()), 0L, rollbackTime - 1));
            if (scannerIterator.hasNext() && statements.size() < multiGetBatchSize) {
                continue;
            }
            Result[] lastVersionResults = hTable.get(statements);
            for (int i = 0; i < lastVersionResults.length; i++) {
                Result lastVersionResult = lastVersionResults[i];
                if (null == lastVersionResult.getRow() && rollbackSync) {
                    Result currentVersionResult = currentVersionResults.get(i);
                    Delete delete = new Delete(currentVersionResult.getRow());
                    mutations.add(delete);
                }
                if (null != lastVersionResult.getRow()) {
                    String oldPath = new String(lastVersionResult.getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN));
                    String nowPath = new String(currentVersionResults.get(i).getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN));
                    if (!oldPath.equals(nowPath) || rollbackSync) {
                        Put put = new Put(lastVersionResult.getRow());
                        put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN, lastVersionResult.getValue(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN));
                        put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN, lastVersionResult.getValue(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN));
                        put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN, lastVersionResult.getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN));
                        mutations.add(put);
                    }
                }
            }
            doMutations(mutator, mutations, limiter);
            currentVersionResults.clear();
            statements.clear();
            mutations.clear();
        }
    } catch (Exception e) {
        LOG.error("hbase index roll back failed", e);
        return false;
    }
    return true;
}
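
The point lookups in this example are built by a generateStatement helper that is also omitted from the page. Below is a hedged sketch of one plausible shape, assuming it is simply an HBase Get scoped to the index column family and the given timestamp range; the body is an assumption for illustration, not the verbatim Hudi code.

private Get generateStatement(String key, long startTime, long endTime) throws IOException {
    // Sketch only: restrict the lookup to the index column family and the requested time window.
    return new Get(Bytes.toBytes(key))
        .addFamily(SYSTEM_COLUMN_FAMILY)
        .setTimeRange(startTime, endTime);
}

In rollbackCommit, the range 0L to rollbackTime - 1 fetches the last index entry written before the rolled-back instant, which is then either restored with a Put or, when no earlier version exists and rollback sync is enabled, removed with a Delete.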
Also used : Delete(org.apache.hadoop.hbase.client.Delete) ResultScanner(org.apache.hadoop.hbase.client.ResultScanner) BufferedMutator(org.apache.hadoop.hbase.client.BufferedMutator) ArrayList(java.util.ArrayList) HTable(org.apache.hadoop.hbase.client.HTable) RateLimiter(org.apache.hudi.common.util.RateLimiter) Date(java.util.Date) Put(org.apache.hadoop.hbase.client.Put) HoodieDependentSystemUnavailableException(org.apache.hudi.exception.HoodieDependentSystemUnavailableException) HoodieIndexException(org.apache.hudi.exception.HoodieIndexException) IOException(java.io.IOException) Result(org.apache.hadoop.hbase.client.Result) Get(org.apache.hadoop.hbase.client.Get) Scan(org.apache.hadoop.hbase.client.Scan) Mutation(org.apache.hadoop.hbase.client.Mutation)

Aggregations

IOException (java.io.IOException) 2
ArrayList (java.util.ArrayList) 2
Date (java.util.Date) 2
BufferedMutator (org.apache.hadoop.hbase.client.BufferedMutator) 2
Delete (org.apache.hadoop.hbase.client.Delete) 2
Get (org.apache.hadoop.hbase.client.Get) 2
HTable (org.apache.hadoop.hbase.client.HTable) 2
Mutation (org.apache.hadoop.hbase.client.Mutation) 2
Put (org.apache.hadoop.hbase.client.Put) 2
Result (org.apache.hadoop.hbase.client.Result) 2
ResultScanner (org.apache.hadoop.hbase.client.ResultScanner) 2
Scan (org.apache.hadoop.hbase.client.Scan) 2
Serializable (java.io.Serializable) 1
HashMap (java.util.HashMap) 1
Iterator (java.util.Iterator) 1
LinkedList (java.util.LinkedList) 1
List (java.util.List) 1
Map (java.util.Map) 1
TimeUnit (java.util.concurrent.TimeUnit) 1
Configuration (org.apache.hadoop.conf.Configuration) 1