Use of org.apache.gobblin.compaction.parser.CompactionPathParser in project incubator-gobblin by apache.
The class CompactionHiveRegistrationAction, method onCompactionJobComplete.
public void onCompactionJobComplete(FileSystemDataset dataset) throws IOException {
  if (state.contains(ConfigurationKeys.HIVE_REGISTRATION_POLICY)) {
    HiveRegister hiveRegister = HiveRegister.get(state);
    HiveRegistrationPolicy hiveRegistrationPolicy = HiveRegistrationPolicyBase.getPolicy(state);
    CompactionPathParser.CompactionParserResult result = new CompactionPathParser(state).parse(dataset);
    List<String> paths = new ArrayList<>();
    for (HiveSpec spec : hiveRegistrationPolicy.getHiveSpecs(new Path(result.getDstAbsoluteDir()))) {
      hiveRegister.register(spec);
      paths.add(spec.getPath().toUri().toASCIIString());
      log.info("Hive registration is done for {}", result.getDstAbsoluteDir());
    }
    // submit events for hive registration
    if (eventSubmitter != null) {
      Map<String, String> eventMetadataMap = ImmutableMap.of(
          CompactionSlaEventHelper.DATASET_URN, dataset.datasetURN(),
          CompactionSlaEventHelper.HIVE_REGISTRATION_PATHS, Joiner.on(',').join(paths));
      this.eventSubmitter.submit(CompactionSlaEventHelper.COMPACTION_HIVE_REGISTRATION_EVENT, eventMetadataMap);
    }
  }
}
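For context, here is a minimal sketch (not taken from the Gobblin source) of how this branch is enabled: registration only runs when the job state carries a policy class under ConfigurationKeys.HIVE_REGISTRATION_POLICY. The policy class name and the destination path below are hypothetical placeholders.

State state = new State();
// Hypothetical policy value; any HiveRegistrationPolicy implementation could be configured here.
state.setProp(ConfigurationKeys.HIVE_REGISTRATION_POLICY, "org.apache.gobblin.hive.policy.HiveRegistrationPolicyBase");
HiveRegister hiveRegister = HiveRegister.get(state);
HiveRegistrationPolicy policy = HiveRegistrationPolicyBase.getPolicy(state);
for (HiveSpec spec : policy.getHiveSpecs(new Path("/data/output/mydataset/hourly/2018/01/01"))) {  // hypothetical path
  hiveRegister.register(spec);
}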
Use of org.apache.gobblin.compaction.parser.CompactionPathParser in project incubator-gobblin by apache.
The class CompactionTimeRangeVerifier, method verify.
public Result verify(FileSystemDataset dataset) {
  final DateTime earliest;
  final DateTime latest;
  try {
    CompactionPathParser.CompactionParserResult result = new CompactionPathParser(state).parse(dataset);
    DateTime folderTime = result.getTime();
    DateTimeZone timeZone = DateTimeZone.forID(
        this.state.getProp(MRCompactor.COMPACTION_TIMEZONE, MRCompactor.DEFAULT_COMPACTION_TIMEZONE));
    DateTime compactionStartTime = new DateTime(this.state.getPropAsLong(CompactionSource.COMPACTION_INIT_TIME), timeZone);
    PeriodFormatter formatter = new PeriodFormatterBuilder()
        .appendMonths().appendSuffix("m")
        .appendDays().appendSuffix("d")
        .appendHours().appendSuffix("h")
        .toFormatter();
    // get earliest time
    String maxTimeAgoStr = this.state.getProp(
        TimeBasedSubDirDatasetsFinder.COMPACTION_TIMEBASED_MAX_TIME_AGO,
        TimeBasedSubDirDatasetsFinder.DEFAULT_COMPACTION_TIMEBASED_MAX_TIME_AGO);
    Period maxTimeAgo = formatter.parsePeriod(maxTimeAgoStr);
    earliest = compactionStartTime.minus(maxTimeAgo);
    // get latest time
    String minTimeAgoStr = this.state.getProp(
        TimeBasedSubDirDatasetsFinder.COMPACTION_TIMEBASED_MIN_TIME_AGO,
        TimeBasedSubDirDatasetsFinder.DEFAULT_COMPACTION_TIMEBASED_MIN_TIME_AGO);
    Period minTimeAgo = formatter.parsePeriod(minTimeAgoStr);
    latest = compactionStartTime.minus(minTimeAgo);
    if (earliest.isBefore(folderTime) && latest.isAfter(folderTime)) {
      log.debug("{} falls in the user defined time range", dataset.datasetRoot());
      return new Result(true, "");
    }
  } catch (Exception e) {
    log.error("{} cannot be verified because of {}", dataset.datasetRoot(), ExceptionUtils.getFullStackTrace(e));
    return new Result(false, e.toString());
  }
  return new Result(false, dataset.datasetRoot() + " is not in between " + earliest + " and " + latest);
}
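As an illustration, a minimal sketch of the time-window arithmetic above, using assumed values ("3d" and "1d" for the max/min time-ago settings, a made-up compaction start time, and an assumed timezone); only folders whose timestamp falls strictly between earliest and latest pass the verifier.

PeriodFormatter formatter = new PeriodFormatterBuilder()
    .appendMonths().appendSuffix("m")
    .appendDays().appendSuffix("d")
    .appendHours().appendSuffix("h")
    .toFormatter();
DateTimeZone tz = DateTimeZone.forID("America/Los_Angeles");  // assumed timezone
DateTime compactionStartTime = new DateTime(2018, 1, 10, 0, 0, tz);
DateTime earliest = compactionStartTime.minus(formatter.parsePeriod("3d"));    // 2018-01-07
DateTime latest = compactionStartTime.minus(formatter.parsePeriod("1d"));      // 2018-01-09
DateTime folderTime = new DateTime(2018, 1, 8, 0, 0, tz);                      // as if parsed from the dataset path
boolean inRange = earliest.isBefore(folderTime) && latest.isAfter(folderTime); // true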
Use of org.apache.gobblin.compaction.parser.CompactionPathParser in project incubator-gobblin by apache.
The class CompactionCompleteFileOperationAction, method onCompactionJobComplete.
/**
 * Replace or append the destination folder with the new avro files from the map-reduce job.
 * Also create a record count file containing the number of records that have been processed.
 */
public void onCompactionJobComplete(FileSystemDataset dataset) throws IOException {
  if (configurator != null && configurator.isJobCreated()) {
    CompactionPathParser.CompactionParserResult result = new CompactionPathParser(state).parse(dataset);
    Path tmpPath = configurator.getMrOutputPath();
    Path dstPath = new Path(result.getDstAbsoluteDir());
    // Append-delta mode is used when the compaction "rename source dir" mode is enabled.
    boolean appendDeltaOutput = this.state.getPropAsBoolean(
        MRCompactor.COMPACTION_RENAME_SOURCE_DIR_ENABLED,
        MRCompactor.DEFAULT_COMPACTION_RENAME_SOURCE_DIR_ENABLED);
    Job job = this.configurator.getConfiguredJob();
    long newTotalRecords = 0;
    long oldTotalRecords = helper.readRecordCount(new Path(result.getDstAbsoluteDir()));
    long executeCount = helper.readExecutionCount(new Path(result.getDstAbsoluteDir()));
    List<Path> goodPaths = CompactionAvroJobConfigurator.getGoodFiles(job, tmpPath, this.fs);
    if (appendDeltaOutput) {
      FsPermission permission = HadoopUtils.deserializeFsPermission(this.state,
          MRCompactorJobRunner.COMPACTION_JOB_OUTPUT_DIR_PERMISSION, FsPermission.getDefault());
      WriterUtils.mkdirsWithRecursivePermission(this.fs, dstPath, permission);
      // Append the files under the MR output directory to the destination.
      for (Path filePath : goodPaths) {
        String fileName = filePath.getName();
        log.info(String.format("Adding %s to %s", filePath.toString(), dstPath));
        Path outPath = new Path(dstPath, fileName);
        if (!this.fs.rename(filePath, outPath)) {
          throw new IOException(String.format("Unable to move %s to %s", filePath.toString(), outPath.toString()));
        }
      }
      // Obtain the record count from the input file names.
      // We don't get the record count from a map-reduce counter because, in the next run, the threshold
      // (delta record) calculation is based on the input file names. By pre-defining which input folders
      // are involved in the MR execution, it is easy to track how many files have been involved so far,
      // which makes it possible to calculate the total record count (all previous runs plus the current run).
      newTotalRecords = this.configurator.getFileNameRecordCount();
    } else {
      this.fs.delete(dstPath, true);
      FsPermission permission = HadoopUtils.deserializeFsPermission(this.state,
          MRCompactorJobRunner.COMPACTION_JOB_OUTPUT_DIR_PERMISSION, FsPermission.getDefault());
      WriterUtils.mkdirsWithRecursivePermission(this.fs, dstPath.getParent(), permission);
      if (!this.fs.rename(tmpPath, dstPath)) {
        throw new IOException(String.format("Unable to move %s to %s", tmpPath, dstPath));
      }
      // Obtain the record count from the map-reduce job counter.
      // We don't get the record count from file names because it is hard to track which files were actually
      // involved in the MR execution: new minutely data is rolled up into hourly folders, and from the daily
      // compaction's perspective we cannot tell which files are newly added (we simply pass whole hourly
      // folders to the MR job instead of individual files).
      Counter counter = job.getCounters().findCounter(AvroKeyMapper.EVENT_COUNTER.RECORD_COUNT);
      newTotalRecords = counter.getValue();
    }
    State compactState = helper.loadState(new Path(result.getDstAbsoluteDir()));
    compactState.setProp(CompactionSlaEventHelper.RECORD_COUNT_TOTAL, Long.toString(newTotalRecords));
    compactState.setProp(CompactionSlaEventHelper.EXEC_COUNT_TOTAL, Long.toString(executeCount + 1));
    compactState.setProp(CompactionSlaEventHelper.MR_JOB_ID, this.configurator.getConfiguredJob().getJobID().toString());
    helper.saveState(new Path(result.getDstAbsoluteDir()), compactState);
    log.info("Updating record count from {} to {} in {} [{}]", oldTotalRecords, newTotalRecords, dstPath, executeCount + 1);
    // submit events for record count
    if (eventSubmitter != null) {
      Map<String, String> eventMetadataMap = ImmutableMap.of(
          CompactionSlaEventHelper.DATASET_URN, dataset.datasetURN(),
          CompactionSlaEventHelper.RECORD_COUNT_TOTAL, Long.toString(newTotalRecords),
          CompactionSlaEventHelper.PREV_RECORD_COUNT_TOTAL, Long.toString(oldTotalRecords),
          CompactionSlaEventHelper.EXEC_COUNT_TOTAL, Long.toString(executeCount + 1),
          CompactionSlaEventHelper.MR_JOB_ID, this.configurator.getConfiguredJob().getJobID().toString());
      this.eventSubmitter.submit(CompactionSlaEventHelper.COMPACTION_RECORD_COUNT_EVENT, eventMetadataMap);
    }
  }
}
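To show the round trip of the counters this action persists, here is a minimal sketch (the destination path and record delta are made up) using only the InputRecordCountHelper calls that appear above; because the counters live next to the output folder, the next run can compute its delta without a state store.

InputRecordCountHelper helper = new InputRecordCountHelper(state);
Path dst = new Path("/data/output/mydataset/daily/2018/01/01");  // hypothetical destination folder
long previousRecords = helper.readRecordCount(dst);              // 0 on the first run
long previousExecutions = helper.readExecutionCount(dst);
State compactState = helper.loadState(dst);
compactState.setProp(CompactionSlaEventHelper.RECORD_COUNT_TOTAL, Long.toString(previousRecords + 1000L));  // made-up delta
compactState.setProp(CompactionSlaEventHelper.EXEC_COUNT_TOTAL, Long.toString(previousExecutions + 1));
helper.saveState(dst, compactState);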
Use of org.apache.gobblin.compaction.parser.CompactionPathParser in project incubator-gobblin by apache.
The class CompactionAuditCountVerifier, method verify.
/**
 * Verify a specific dataset by following the steps below:
 *   1) Retrieve a tier-to-count mapping
 *   2) Read the count from {@link CompactionAuditCountVerifier#gobblinTier}
 *   3) Read the counts from all other {@link CompactionAuditCountVerifier#referenceTiers}
 *   4) Compare the counts retrieved in steps 2) and 3); if any (gobblin/reference) ratio is >= threshold, return true, else return false
 * @param dataset the dataset to be verified
 * @return whether the verification succeeded
 */
public Result verify(FileSystemDataset dataset) {
  if (auditCountClient == null) {
    log.debug("No audit count client specified, skipped");
    return new Result(true, "");
  }
  CompactionPathParser.CompactionParserResult result = new CompactionPathParser(this.state).parse(dataset);
  DateTime startTime = result.getTime();
  DateTime endTime = startTime.plusHours(1);
  String datasetName = result.getDatasetName();
  try {
    Map<String, Long> countsByTier = auditCountClient.fetch(datasetName, startTime.getMillis(), endTime.getMillis());
    for (String tier : referenceTiers) {
      Result rst = passed(datasetName, countsByTier, tier);
      if (rst.isSuccessful()) {
        return new Result(true, "");
      }
    }
  } catch (IOException e) {
    return new Result(false, ExceptionUtils.getFullStackTrace(e));
  }
  return new Result(false, String.format("%s data is not complete between %s and %s", datasetName, startTime, endTime));
}
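The passed(...) helper is not shown in this snippet. Below is a hedged sketch of what it could look like, based only on the javadoc above; the gobblinTier and threshold fields and the exact ratio comparison are assumptions, not the actual Gobblin implementation.

private Result passed(String datasetName, Map<String, Long> countsByTier, String referenceTier) {
  Long refCount = countsByTier.get(referenceTier);
  Long gobblinCount = countsByTier.get(this.gobblinTier);  // assumed field holding the gobblin tier name
  if (refCount == null || gobblinCount == null || refCount == 0) {
    return new Result(false, "Missing audit counts for " + datasetName);
  }
  double ratio = (double) gobblinCount / (double) refCount;  // completeness relative to the reference tier
  return ratio >= this.threshold  // assumed configurable completeness threshold
      ? new Result(true, "")
      : new Result(false, String.format("%s completeness %.3f is below the threshold", datasetName, ratio));
}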
Use of org.apache.gobblin.compaction.parser.CompactionPathParser in project incubator-gobblin by apache.
The class CompactionThresholdVerifier, method verify.
/**
 * There are two record counts being compared here:
 *   1) the new record count in the input folder
 *   2) the record count compacted previously, from the last run
 * Calculate the difference between the two and compare it with a predefined threshold.
 *
 * (Alternatively, we could save the previous record count to a state store. However, each input
 * folder is a dataset, and we might end up loading too much redundant job-level state for each
 * dataset. To avoid this scalability issue, we choose a stateless approach where each dataset tracks
 * its own record count and persists it in the file system.)
 *
 * @return true iff the difference exceeds the threshold or this is the first compaction
 */
public Result verify(FileSystemDataset dataset) {
  Map<String, Double> thresholdMap = RecompactionConditionBasedOnRatio.getDatasetRegexAndRecompactThreshold(
      state.getProp(MRCompactor.COMPACTION_LATEDATA_THRESHOLD_FOR_RECOMPACT_PER_DATASET, StringUtils.EMPTY));
  CompactionPathParser.CompactionParserResult result = new CompactionPathParser(state).parse(dataset);
  double threshold = RecompactionConditionBasedOnRatio.getRatioThresholdByDatasetName(result.getDatasetName(), thresholdMap);
  log.debug("Threshold is {} for dataset {}", threshold, result.getDatasetName());
  InputRecordCountHelper helper = new InputRecordCountHelper(state);
  try {
    double newRecords = helper.calculateRecordCount(Lists.newArrayList(new Path(dataset.datasetURN())));
    double oldRecords = helper.readRecordCount(new Path(result.getDstAbsoluteDir()));
    if (oldRecords == 0) {
      return new Result(true, "");
    }
    if ((newRecords - oldRecords) / oldRecords > threshold) {
      log.debug("Dataset {} records exceeded the threshold {}", dataset.datasetURN(), threshold);
      return new Result(true, "");
    }
    return new Result(false, String.format("%s is failed for dataset %s. Prev=%f, Cur=%f, not reaching to threshold %f",
        this.getName(), result.getDatasetName(), oldRecords, newRecords, threshold));
  } catch (IOException e) {
    return new Result(false, ExceptionUtils.getFullStackTrace(e));
  }
}
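A quick worked example of the ratio check above, with made-up numbers: if the previous run compacted 1,000,000 records and the input folder now holds 1,080,000, the growth ratio is 0.08; with a threshold of 0.05 the verifier passes and recompaction is triggered.

double oldRecords = 1_000_000d;   // record count persisted by the previous run
double newRecords = 1_080_000d;   // record count calculated from the current input folder
double threshold = 0.05d;         // assumed per-dataset threshold
boolean recompact = (oldRecords == 0) || ((newRecords - oldRecords) / oldRecords > threshold);  // true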