Use of org.apache.gobblin.util.filters.AndPathFilter in project incubator-gobblin by apache.
The class FsSpecConsumer, method changedSpecs.
/**
 * Returns the list of newly changed {@link Spec}s for execution on a {@link SpecExecutor}.
 * The {@link Spec}s are returned in increasing order of their modification times.
 */
@Override
public Future<? extends List<Pair<SpecExecutor.Verb, Spec>>> changedSpecs() {
  List<Pair<SpecExecutor.Verb, Spec>> specList = new ArrayList<>();
  FileStatus[] fileStatuses;
  try {
    fileStatuses = this.fs.listStatus(this.specDirPath,
        new AndPathFilter(new HiddenFilter(), new AvroUtils.AvroPathFilter()));
  } catch (IOException e) {
    log.error("Error when listing files at path: {}", this.specDirPath.toString(), e);
    return null;
  }
  log.info("Found {} files at path {}", fileStatuses.length, this.specDirPath.toString());

  // Sort the {@link JobSpec}s in increasing order of their modification times.
  // This is done so that the {@link JobSpec}s can be handled in FIFO order by the
  // JobConfigurationManager and, eventually, the GobblinHelixJobScheduler.
  Arrays.sort(fileStatuses, Comparator.comparingLong(FileStatus::getModificationTime));

  for (FileStatus fileStatus : fileStatuses) {
    DataFileReader<AvroJobSpec> dataFileReader;
    try {
      dataFileReader = new DataFileReader<>(new FsInput(fileStatus.getPath(), this.fs.getConf()),
          new SpecificDatumReader<>());
    } catch (IOException e) {
      log.error("Error creating DataFileReader for: {}", fileStatus.getPath().toString(), e);
      continue;
    }

    // Each spec file is expected to carry a single AvroJobSpec record; only the first is read.
    AvroJobSpec avroJobSpec = null;
    while (dataFileReader.hasNext()) {
      avroJobSpec = dataFileReader.next();
      break;
    }

    if (avroJobSpec != null) {
      JobSpec.Builder jobSpecBuilder = new JobSpec.Builder(avroJobSpec.getUri());
      Properties props = new Properties();
      props.putAll(avroJobSpec.getProperties());
      jobSpecBuilder.withJobCatalogURI(avroJobSpec.getUri())
          .withVersion(avroJobSpec.getVersion())
          .withDescription(avroJobSpec.getDescription())
          .withConfigAsProperties(props)
          .withConfig(ConfigUtils.propertiesToConfig(props));
      try {
        if (!avroJobSpec.getTemplateUri().isEmpty()) {
          jobSpecBuilder.withTemplate(new URI(avroJobSpec.getTemplateUri()));
        }
      } catch (URISyntaxException u) {
        log.error("Error building a job spec with template URI {}: ", avroJobSpec.getTemplateUri(), u);
        continue;
      }

      String verbName = avroJobSpec.getMetadata().get(SpecExecutor.VERB_KEY);
      SpecExecutor.Verb verb = SpecExecutor.Verb.valueOf(verbName);
      JobSpec jobSpec = jobSpecBuilder.build();
      log.debug("Successfully built jobspec: {}", jobSpec.getUri().toString());
      specList.add(new ImmutablePair<SpecExecutor.Verb, Spec>(verb, jobSpec));
      this.specToPathMap.put(jobSpec.getUri(), fileStatus.getPath());
    }
  }
  return new CompletedFuture<>(specList, null);
}
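All three snippets on this page rely on the same composition idea: AndPathFilter wraps several Hadoop PathFilters and accepts a path only if every child filter accepts it. Below is a minimal, self-contained sketch of that pattern; AllMatchPathFilter is a hypothetical stand-in for illustration, not the Gobblin class itself.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// Hypothetical stand-in illustrating the AND-composition used above:
// a path passes only if every child filter accepts it.
public class AllMatchPathFilter implements PathFilter {

  private final PathFilter[] filters;

  public AllMatchPathFilter(PathFilter... filters) {
    this.filters = filters;
  }

  @Override
  public boolean accept(Path path) {
    for (PathFilter filter : this.filters) {
      if (!filter.accept(path)) {
        return false;
      }
    }
    return true;
  }
}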
Use of org.apache.gobblin.util.filters.AndPathFilter in project incubator-gobblin by apache.
The class RecursivePathFinder, method getPaths.
public Set<FileStatus> getPaths(boolean skipHiddenPaths) throws IOException {
  if (!this.fs.exists(this.rootPath)) {
    return Sets.newHashSet();
  }
  // Optionally AND the configured filter with a hidden-path filter.
  PathFilter actualFilter =
      skipHiddenPaths ? new AndPathFilter(new HiddenFilter(), this.pathFilter) : this.pathFilter;
  List<FileStatus> files = FileListUtils.listFilesToCopyAtPath(this.fs, this.rootPath, actualFilter,
      this.applyFilterToDirectories, includeEmptyDirectories);
  return Sets.newHashSet(files);
}
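The skipHiddenPaths branch above ANDs the caller's filter with a HiddenFilter. As a rough sketch of what such a hidden-path filter does, assuming the usual Hadoop convention that names beginning with '.' or '_' are hidden (the exact semantics of Gobblin's HiddenFilter may differ):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// Sketch of a hidden-path filter; assumes the common Hadoop convention
// that file names beginning with '.' or '_' should be skipped.
public class SkipHiddenPathFilter implements PathFilter {

  @Override
  public boolean accept(Path path) {
    String name = path.getName();
    return !name.startsWith(".") && !name.startsWith("_");
  }
}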
Use of org.apache.gobblin.util.filters.AndPathFilter in project incubator-gobblin by apache.
The class UnixTimestampRecursiveCopyableDataset, method getFilesAtPath.
@Override
protected List<FileStatus> getFilesAtPath(FileSystem fs, Path path, PathFilter fileFilter) throws IOException {
  // Filter files by lookback period (fileNames >= startDate and fileNames <= endDate).
  PathFilter andPathFilter = new AndPathFilter(fileFilter, new TimestampPathFilter());
  List<FileStatus> files = super.getFilesAtPath(fs, path, andPathFilter);
  if (VersionSelectionPolicy.ALL == versionSelectionPolicy) {
    return files;
  }

  // Group files by (root path, day), keyed within each day by unix timestamp,
  // so files can be selected per day according to the version selection policy.
  Map<Pair<String, LocalDate>, TreeMap<Long, List<FileStatus>>> pathTimestampFilesMap = new HashMap<>();
  for (FileStatus fileStatus : files) {
    String relativePath = PathUtils.relativizePath(
        PathUtils.getPathWithoutSchemeAndAuthority(fileStatus.getPath()), datasetRoot()).toString();
    Matcher matcher = timestampPattern.matcher(relativePath);
    if (!matcher.matches()) {
      continue;
    }
    String timestampStr = matcher.group(1);
    String rootPath = relativePath.substring(0, relativePath.indexOf(timestampStr));
    Long unixTimestamp = Long.parseLong(timestampStr);
    LocalDate localDate = new LocalDateTime(unixTimestamp, dateTimeZone).toLocalDate();
    Pair<String, LocalDate> key = new ImmutablePair<>(rootPath, localDate);
    if (!pathTimestampFilesMap.containsKey(key)) {
      pathTimestampFilesMap.put(key, new TreeMap<Long, List<FileStatus>>());
    }
    Map<Long, List<FileStatus>> timestampFilesMap = pathTimestampFilesMap.get(key);
    if (!timestampFilesMap.containsKey(unixTimestamp)) {
      timestampFilesMap.put(unixTimestamp, Lists.newArrayList());
    }
    List<FileStatus> fileStatuses = timestampFilesMap.get(unixTimestamp);
    fileStatuses.add(fileStatus);
  }

  List<FileStatus> result = new ArrayList<>();
  for (TreeMap<Long, List<FileStatus>> timestampFileStatus : pathTimestampFilesMap.values()) {
    if (timestampFileStatus.isEmpty()) {
      continue;
    }
    switch (versionSelectionPolicy) {
      case EARLIEST:
        result.addAll(timestampFileStatus.firstEntry().getValue());
        break;
      case LATEST:
        result.addAll(timestampFileStatus.lastEntry().getValue());
        break;
      default:
        throw new RuntimeException("Unsupported version selection policy");
    }
  }
  return result;
}
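The per-day version selection above keys each day's files by unix timestamp in a TreeMap, so the EARLIEST and LATEST policies reduce to firstEntry() and lastEntry(), since a TreeMap keeps its keys sorted. A minimal demonstration with hypothetical timestamps and file names:

import java.util.Arrays;
import java.util.List;
import java.util.TreeMap;

// Hypothetical data; shows why a TreeMap makes the EARLIEST/LATEST
// policies trivial: keys (unix timestamps) are kept in sorted order.
public class VersionSelectionSketch {

  public static void main(String[] args) {
    TreeMap<Long, List<String>> versionsForDay = new TreeMap<>();
    versionsForDay.put(1696118400L, Arrays.asList("part-0001.avro"));
    versionsForDay.put(1696161600L, Arrays.asList("part-0002.avro"));
    versionsForDay.put(1696204700L, Arrays.asList("part-0003.avro"));

    System.out.println("EARLIEST -> " + versionsForDay.firstEntry().getValue());
    System.out.println("LATEST   -> " + versionsForDay.lastEntry().getValue());
  }
}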