
Example 1 with CompactionPathParser

use of org.apache.gobblin.compaction.parser.CompactionPathParser in project incubator-gobblin by apache.

In class CompactionHiveRegistrationAction, the method onCompactionJobComplete:

public void onCompactionJobComplete(FileSystemDataset dataset) throws IOException {
    if (state.contains(ConfigurationKeys.HIVE_REGISTRATION_POLICY)) {
        HiveRegister hiveRegister = HiveRegister.get(state);
        HiveRegistrationPolicy hiveRegistrationPolicy = HiveRegistrationPolicyBase.getPolicy(state);
        CompactionPathParser.CompactionParserResult result = new CompactionPathParser(state).parse(dataset);
        List<String> paths = new ArrayList<>();
        for (HiveSpec spec : hiveRegistrationPolicy.getHiveSpecs(new Path(result.getDstAbsoluteDir()))) {
            hiveRegister.register(spec);
            paths.add(spec.getPath().toUri().toASCIIString());
            log.info("Hive registration is done for {}", result.getDstAbsoluteDir());
        }
        // submit events for hive registration
        if (eventSubmitter != null) {
            Map<String, String> eventMetadataMap = ImmutableMap.of(
                CompactionSlaEventHelper.DATASET_URN, dataset.datasetURN(),
                CompactionSlaEventHelper.HIVE_REGISTRATION_PATHS, Joiner.on(',').join(paths));
            this.eventSubmitter.submit(CompactionSlaEventHelper.COMPACTION_HIVE_REGISTRATION_EVENT, eventMetadataMap);
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HiveRegister(org.apache.gobblin.hive.HiveRegister) ArrayList(java.util.ArrayList) CompactionPathParser(org.apache.gobblin.compaction.parser.CompactionPathParser) HiveRegistrationPolicy(org.apache.gobblin.hive.policy.HiveRegistrationPolicy) HiveSpec(org.apache.gobblin.hive.spec.HiveSpec)
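
All five examples on this page start by turning the dataset path into a CompactionPathParser.CompactionParserResult. Below is a minimal sketch of the accessors they rely on; the state and dataset wiring is assumed to be in place, not taken from a specific job:

// Minimal sketch: the CompactionParserResult fields consumed by the actions and verifiers below.
CompactionPathParser.CompactionParserResult result = new CompactionPathParser(state).parse(dataset);
String dstDir = result.getDstAbsoluteDir();   // absolute destination directory of the compacted output
String name = result.getDatasetName();        // logical dataset name
DateTime folderTime = result.getTime();       // timestamp parsed from the time-based folder path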

Example 2 with CompactionPathParser

use of org.apache.gobblin.compaction.parser.CompactionPathParser in project incubator-gobblin by apache.

In class CompactionTimeRangeVerifier, the method verify:

public Result verify(FileSystemDataset dataset) {
    final DateTime earliest;
    final DateTime latest;
    try {
        CompactionPathParser.CompactionParserResult result = new CompactionPathParser(state).parse(dataset);
        DateTime folderTime = result.getTime();
        DateTimeZone timeZone = DateTimeZone.forID(this.state.getProp(MRCompactor.COMPACTION_TIMEZONE, MRCompactor.DEFAULT_COMPACTION_TIMEZONE));
        DateTime compactionStartTime = new DateTime(this.state.getPropAsLong(CompactionSource.COMPACTION_INIT_TIME), timeZone);
        PeriodFormatter formatter = new PeriodFormatterBuilder().appendMonths().appendSuffix("m").appendDays().appendSuffix("d").appendHours().appendSuffix("h").toFormatter();
        // get earliest time
        String maxTimeAgoStr = this.state.getProp(TimeBasedSubDirDatasetsFinder.COMPACTION_TIMEBASED_MAX_TIME_AGO, TimeBasedSubDirDatasetsFinder.DEFAULT_COMPACTION_TIMEBASED_MAX_TIME_AGO);
        Period maxTimeAgo = formatter.parsePeriod(maxTimeAgoStr);
        earliest = compactionStartTime.minus(maxTimeAgo);
        // get latest time
        String minTimeAgoStr = this.state.getProp(TimeBasedSubDirDatasetsFinder.COMPACTION_TIMEBASED_MIN_TIME_AGO, TimeBasedSubDirDatasetsFinder.DEFAULT_COMPACTION_TIMEBASED_MIN_TIME_AGO);
        Period minTimeAgo = formatter.parsePeriod(minTimeAgoStr);
        latest = compactionStartTime.minus(minTimeAgo);
        if (earliest.isBefore(folderTime) && latest.isAfter(folderTime)) {
            log.debug("{} falls in the user defined time range", dataset.datasetRoot());
            return new Result(true, "");
        }
    } catch (Exception e) {
        log.error("{} cannot be verified because of {}", dataset.datasetRoot(), ExceptionUtils.getFullStackTrace(e));
        return new Result(false, e.toString());
    }
    return new Result(false, dataset.datasetRoot() + " is not in between " + earliest + " and " + latest);
}
Also used : PeriodFormatterBuilder(org.joda.time.format.PeriodFormatterBuilder) PeriodFormatter(org.joda.time.format.PeriodFormatter) Period(org.joda.time.Period) CompactionPathParser(org.apache.gobblin.compaction.parser.CompactionPathParser) DateTime(org.joda.time.DateTime) DateTimeZone(org.joda.time.DateTimeZone)
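
The min/max time-ago properties are parsed with a joda-time PeriodFormatter. A minimal sketch of how the verification window is derived; the "3d"/"1d" strings and the start time are made-up example values, not values read from the project:

PeriodFormatter formatter = new PeriodFormatterBuilder()
    .appendMonths().appendSuffix("m")
    .appendDays().appendSuffix("d")
    .appendHours().appendSuffix("h")
    .toFormatter();
DateTime compactionStartTime = new DateTime(2020, 1, 2, 0, 0, DateTimeZone.UTC); // hypothetical init time
DateTime earliest = compactionStartTime.minus(formatter.parsePeriod("3d"));      // 2019-12-30T00:00Z
DateTime latest = compactionStartTime.minus(formatter.parsePeriod("1d"));        // 2020-01-01T00:00Z
// A folder time of 2019-12-31T00:00Z falls inside (earliest, latest) and passes verification;
// 2020-01-01T10:00Z is after `latest` and fails.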

Example 3 with CompactionPathParser

use of org.apache.gobblin.compaction.parser.CompactionPathParser in project incubator-gobblin by apache.

In class CompactionCompleteFileOperationAction, the method onCompactionJobComplete:

/**
 * Replace or append to the destination folder with the new avro files from the map-reduce job.
 * Create a record count file containing the number of records that have been processed.
 */
public void onCompactionJobComplete(FileSystemDataset dataset) throws IOException {
    if (configurator != null && configurator.isJobCreated()) {
        CompactionPathParser.CompactionParserResult result = new CompactionPathParser(state).parse(dataset);
        Path tmpPath = configurator.getMrOutputPath();
        Path dstPath = new Path(result.getDstAbsoluteDir());
        // this is append delta mode due to the compaction rename source dir mode being enabled
        boolean appendDeltaOutput = this.state.getPropAsBoolean(MRCompactor.COMPACTION_RENAME_SOURCE_DIR_ENABLED, MRCompactor.DEFAULT_COMPACTION_RENAME_SOURCE_DIR_ENABLED);
        Job job = this.configurator.getConfiguredJob();
        long newTotalRecords = 0;
        long oldTotalRecords = helper.readRecordCount(new Path(result.getDstAbsoluteDir()));
        long executeCount = helper.readExecutionCount(new Path(result.getDstAbsoluteDir()));
        List<Path> goodPaths = CompactionAvroJobConfigurator.getGoodFiles(job, tmpPath, this.fs);
        if (appendDeltaOutput) {
            FsPermission permission = HadoopUtils.deserializeFsPermission(this.state, MRCompactorJobRunner.COMPACTION_JOB_OUTPUT_DIR_PERMISSION, FsPermission.getDefault());
            WriterUtils.mkdirsWithRecursivePermission(this.fs, dstPath, permission);
            // append files under mr output to destination
            for (Path filePath : goodPaths) {
                String fileName = filePath.getName();
                log.info(String.format("Adding %s to %s", filePath.toString(), dstPath));
                Path outPath = new Path(dstPath, fileName);
                if (!this.fs.rename(filePath, outPath)) {
                    throw new IOException(String.format("Unable to move %s to %s", filePath.toString(), outPath.toString()));
                }
            }
            // Obtain the record count from the input file names.
            // We don't get the record count from a map-reduce counter because in the next run, the threshold
            // (delta record) calculation is based on the input file names. By pre-defining which input folders are
            // involved in the MR execution, it is easy to track how many files have been involved in MR so far,
            // which makes it possible to calculate the total number of records (all previous runs + the current run).
            newTotalRecords = this.configurator.getFileNameRecordCount();
        } else {
            this.fs.delete(dstPath, true);
            FsPermission permission = HadoopUtils.deserializeFsPermission(this.state, MRCompactorJobRunner.COMPACTION_JOB_OUTPUT_DIR_PERMISSION, FsPermission.getDefault());
            WriterUtils.mkdirsWithRecursivePermission(this.fs, dstPath.getParent(), permission);
            if (!this.fs.rename(tmpPath, dstPath)) {
                throw new IOException(String.format("Unable to move %s to %s", tmpPath, dstPath));
            }
            // Obtain the record count from the map-reduce job counter.
            // We don't get the record count from file names because tracking which files are actually involved in
            // the MR execution can be hard: new minutely data is rolled up into hourly folders, but from the daily
            // compaction perspective we cannot tell which files were newly added (we simply pass all hourly folders
            // to the MR job instead of individual files).
            Counter counter = job.getCounters().findCounter(AvroKeyMapper.EVENT_COUNTER.RECORD_COUNT);
            newTotalRecords = counter.getValue();
        }
        State compactState = helper.loadState(new Path(result.getDstAbsoluteDir()));
        compactState.setProp(CompactionSlaEventHelper.RECORD_COUNT_TOTAL, Long.toString(newTotalRecords));
        compactState.setProp(CompactionSlaEventHelper.EXEC_COUNT_TOTAL, Long.toString(executeCount + 1));
        compactState.setProp(CompactionSlaEventHelper.MR_JOB_ID, this.configurator.getConfiguredJob().getJobID().toString());
        helper.saveState(new Path(result.getDstAbsoluteDir()), compactState);
        log.info("Updating record count from {} to {} in {} [{}]", oldTotalRecords, newTotalRecords, dstPath, executeCount + 1);
        // submit events for record count
        if (eventSubmitter != null) {
            Map<String, String> eventMetadataMap = ImmutableMap.of(
                CompactionSlaEventHelper.DATASET_URN, dataset.datasetURN(),
                CompactionSlaEventHelper.RECORD_COUNT_TOTAL, Long.toString(newTotalRecords),
                CompactionSlaEventHelper.PREV_RECORD_COUNT_TOTAL, Long.toString(oldTotalRecords),
                CompactionSlaEventHelper.EXEC_COUNT_TOTAL, Long.toString(executeCount + 1),
                CompactionSlaEventHelper.MR_JOB_ID, this.configurator.getConfiguredJob().getJobID().toString());
            this.eventSubmitter.submit(CompactionSlaEventHelper.COMPACTION_RECORD_COUNT_EVENT, eventMetadataMap);
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) IOException(java.io.IOException) Counter(org.apache.hadoop.mapreduce.Counter) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) CompactionPathParser(org.apache.gobblin.compaction.parser.CompactionPathParser) FsPermission(org.apache.hadoop.fs.permission.FsPermission) Job(org.apache.hadoop.mapreduce.Job)
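
The record-count bookkeeping above goes through InputRecordCountHelper, which persists counts next to the compacted output instead of in a job-level state store. A minimal sketch of that round trip; the destination path and the added count are hypothetical:

InputRecordCountHelper helper = new InputRecordCountHelper(state);
Path dst = new Path("/data/output/mydb/mytable/hourly/2020/01/01"); // hypothetical destination dir
long previousRecords = helper.readRecordCount(dst);       // 0 on the first compaction
long previousExecutions = helper.readExecutionCount(dst);
State compactState = helper.loadState(dst);
compactState.setProp(CompactionSlaEventHelper.RECORD_COUNT_TOTAL, Long.toString(previousRecords + 1000L));
compactState.setProp(CompactionSlaEventHelper.EXEC_COUNT_TOTAL, Long.toString(previousExecutions + 1));
helper.saveState(dst, compactState); // picked up by the next run's readRecordCount/readExecutionCount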

Example 4 with CompactionPathParser

use of org.apache.gobblin.compaction.parser.CompactionPathParser in project incubator-gobblin by apache.

In class CompactionAuditCountVerifier, the method verify:

/**
 * Verify a specific dataset by following the steps below:
 *    1) Retrieve a tier-to-count mapping
 *    2) Read the count from {@link CompactionAuditCountVerifier#gobblinTier}
 *    3) Read the counts from all other {@link CompactionAuditCountVerifier#referenceTiers}
 *    4) Compare the counts retrieved in steps 2) and 3); if any ratio (gobblin/reference) >= threshold, return true, else return false
 * @param dataset Dataset that needs to be verified
 * @return Whether verification succeeded
 */
public Result verify(FileSystemDataset dataset) {
    if (auditCountClient == null) {
        log.debug("No audit count client specified, skipped");
        return new Result(true, "");
    }
    CompactionPathParser.CompactionParserResult result = new CompactionPathParser(this.state).parse(dataset);
    DateTime startTime = result.getTime();
    DateTime endTime = startTime.plusHours(1);
    String datasetName = result.getDatasetName();
    try {
        Map<String, Long> countsByTier = auditCountClient.fetch(datasetName, startTime.getMillis(), endTime.getMillis());
        for (String tier : referenceTiers) {
            Result rst = passed(datasetName, countsByTier, tier);
            if (rst.isSuccessful()) {
                return new Result(true, "");
            }
        }
    } catch (IOException e) {
        return new Result(false, ExceptionUtils.getFullStackTrace(e));
    }
    return new Result(false, String.format("%s data is not complete between %s and %s", datasetName, startTime, endTime));
}
Also used : CompactionPathParser(org.apache.gobblin.compaction.parser.CompactionPathParser) IOException(java.io.IOException) DateTime(org.joda.time.DateTime)
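
The passed helper invoked above is not part of this snippet. The following is a rough sketch of the rule the javadoc describes, accepting when the gobblin-tier count divided by a reference-tier count reaches a threshold; the field names gobblinTier and threshold are assumptions for illustration, not the project's actual implementation:

// Hypothetical sketch of the documented completeness rule.
private Result passed(String datasetName, Map<String, Long> countsByTier, String referenceTier) {
    Long gobblinCount = countsByTier.get(this.gobblinTier); // assumed field holding the Gobblin tier name
    Long referenceCount = countsByTier.get(referenceTier);
    if (gobblinCount == null || referenceCount == null || referenceCount == 0) {
        return new Result(false, String.format("Missing audit counts for %s", datasetName));
    }
    double ratio = (double) gobblinCount / referenceCount;
    // assumed threshold field, e.g. 0.999 for "at least 99.9% complete"
    return ratio >= this.threshold ? new Result(true, "")
        : new Result(false, String.format("%s completeness %f is below threshold", datasetName, ratio));
}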

Example 5 with CompactionPathParser

use of org.apache.gobblin.compaction.parser.CompactionPathParser in project incubator-gobblin by apache.

In class CompactionThresholdVerifier, the method verify:

/**
 * There are two record counts we are comparing here:
 *    1) The new record count in the input folder
 *    2) The record count we compacted previously in the last run
 * Calculate the difference between the two numbers and compare it with a predefined threshold.
 *
 * (Alternatively we could save the previous record count to a state store. However, each input
 * folder is a dataset, and we may end up loading too much redundant job-level state for each
 * dataset. To avoid this scalability issue, we choose a stateless approach where each dataset
 * tracks its own record count and persists it in the file system.)
 *
 * @return true iff the difference exceeds the threshold or this is the first compaction
 */
public Result verify(FileSystemDataset dataset) {
    Map<String, Double> thresholdMap = RecompactionConditionBasedOnRatio.getDatasetRegexAndRecompactThreshold(state.getProp(MRCompactor.COMPACTION_LATEDATA_THRESHOLD_FOR_RECOMPACT_PER_DATASET, StringUtils.EMPTY));
    CompactionPathParser.CompactionParserResult result = new CompactionPathParser(state).parse(dataset);
    double threshold = RecompactionConditionBasedOnRatio.getRatioThresholdByDatasetName(result.getDatasetName(), thresholdMap);
    log.debug("Threshold is {} for dataset {}", threshold, result.getDatasetName());
    InputRecordCountHelper helper = new InputRecordCountHelper(state);
    try {
        double newRecords = helper.calculateRecordCount(Lists.newArrayList(new Path(dataset.datasetURN())));
        double oldRecords = helper.readRecordCount(new Path(result.getDstAbsoluteDir()));
        if (oldRecords == 0) {
            return new Result(true, "");
        }
        if ((newRecords - oldRecords) / oldRecords > threshold) {
            log.debug("Dataset {} records exceeded the threshold {}", dataset.datasetURN(), threshold);
            return new Result(true, "");
        }
        return new Result(false, String.format("%s is failed for dataset %s. Prev=%f, Cur=%f, not reaching to threshold %f", this.getName(), result.getDatasetName(), oldRecords, newRecords, threshold));
    } catch (IOException e) {
        return new Result(false, ExceptionUtils.getFullStackTrace(e));
    }
}
Also used : Path(org.apache.hadoop.fs.Path) CompactionPathParser(org.apache.gobblin.compaction.parser.CompactionPathParser) IOException(java.io.IOException)
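
A worked example of the threshold rule; all numbers are hypothetical:

double threshold = 0.1;   // recompact when new data grew the input by more than 10%
double oldRecords = 100;  // records already compacted in the previous run
double newRecords = 120;  // records currently present in the input folder
// (120 - 100) / 100 = 0.2 > 0.1, so the verifier returns a passing Result and recompaction proceeds.
boolean recompact = oldRecords == 0 || (newRecords - oldRecords) / oldRecords > threshold;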

Aggregations

CompactionPathParser (org.apache.gobblin.compaction.parser.CompactionPathParser): 6
Path (org.apache.hadoop.fs.Path): 4
IOException (java.io.IOException): 3
DateTime (org.joda.time.DateTime): 2
ArrayList (java.util.ArrayList): 1
State (org.apache.gobblin.configuration.State): 1
WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 1
HiveRegister (org.apache.gobblin.hive.HiveRegister): 1
HiveRegistrationPolicy (org.apache.gobblin.hive.policy.HiveRegistrationPolicy): 1
HiveSpec (org.apache.gobblin.hive.spec.HiveSpec): 1
FsPermission (org.apache.hadoop.fs.permission.FsPermission): 1
Counter (org.apache.hadoop.mapreduce.Counter): 1
Job (org.apache.hadoop.mapreduce.Job): 1
DateTimeZone (org.joda.time.DateTimeZone): 1
Period (org.joda.time.Period): 1
PeriodFormatter (org.joda.time.format.PeriodFormatter): 1
PeriodFormatterBuilder (org.joda.time.format.PeriodFormatterBuilder): 1