Usage of com.bakdata.conquery.models.preproc.PreprocessingJob in the conquery project by bakdata.
Class PreprocessorCommand, method run.
/**
 * Runs preprocessing as configured on the command line.
 *
 * <p>Reads the {@code fast-fail}, {@code strict}, {@code tag}, {@code in}, {@code out} and {@code desc}
 * arguments, collects the {@link PreprocessingJob}s for every description file (once per tag, if tags
 * are given), checks that the source file of every input exists, drops jobs that do not require
 * processing and submits the remaining ones to the thread pool. After the pool has terminated, a
 * summary of succeeded, missing and failed jobs is logged.
 *
 * @param environment supplies the validator handed to the description discovery
 * @param namespace   the parsed command line arguments
 * @param config      supplies the number of preprocessing threads and is passed on to the preprocessor
 * @throws Exception if discovering the descriptions fails or waiting for the pool is interrupted
 */
@Override
protected void run(Environment environment, Namespace namespace, ConqueryConfig config) throws Exception {
    if (pool == null) {
        pool = Executors.newFixedThreadPool(config.getPreprocessor().getNThreads());
    }

    // getBoolean may return null, so the defaults are applied explicitly.
    isFailFast = Optional.ofNullable(namespace.getBoolean("fast-fail")).orElse(false);
    isStrict = Optional.ofNullable(namespace.getBoolean("strict")).orElse(true);

    // Tag if present is appended to input-file csvs, output-file cqpp and used as id of cqpps
    final List<String> tags = namespace.<String>getList("tag");
    final File inDir = namespace.get("in");
    final File outDir = namespace.get("out");
    final List<File> descriptionFiles = namespace.<File>getList("desc");

    log.info("Preprocessing from command line config.");

    // Without tags every description is processed exactly once, untagged.
    final List<Optional<String>> tagOptions = new ArrayList<>();
    if (tags == null || tags.isEmpty()) {
        tagOptions.add(Optional.empty());
    } else {
        for (String tag : tags) {
            tagOptions.add(Optional.of(tag));
        }
    }

    final Collection<PreprocessingJob> jobs = new ArrayList<>();
    for (Optional<String> tag : tagOptions) {
        for (File desc : descriptionFiles) {
            jobs.addAll(findPreprocessingDescriptions(desc, inDir, outDir, tag, environment.getValidator()));
        }
    }

    // One entry per missing source file, i.e. a job can be listed more than once.
    List<PreprocessingJob> missing = new ArrayList<>();
    for (PreprocessingJob job : jobs) {
        for (TableInputDescriptor input : job.getDescriptor().getInputs()) {
            final File sourceFile = Preprocessor.resolveSourceFile(input.getSourceFile(), job.getCsvDirectory(), job.getTag());
            if (!sourceFile.exists()) {
                log.error("Did not find file `{}` for Preprocessing[{}].", sourceFile, job);
                missing.add(job);
            }
        }
    }

    // This will halt preprocessing immediately.
    if (isStrict && !missing.isEmpty()) {
        log.error("FAILED Preprocessing, files are missing.");
        doFail();
    }

    jobs.removeIf(Predicate.not(PreprocessorCommand::requiresProcessing));

    final long totalSize = jobs.stream().mapToLong(PreprocessingJob::estimateTotalCsvSizeBytes).sum();
    log.info("Required to preprocess {} in total", BinaryByteUnit.format(totalSize));

    ProgressBar totalProgress = new ProgressBar(totalSize, System.out);

    for (PreprocessingJob job : jobs) {
        pool.submit(() -> {
            ConqueryMDC.setLocation(job.toString());
            try {
                Preprocessor.preprocess(job, totalProgress, config);
                success.add(job.toString());
            } catch (FileNotFoundException e) {
                log.warn("Did not find file `{}` for preprocessing.", e.getMessage());
                addMissing(job);
            } catch (Exception e) {
                log.error("Failed to preprocess {}", LogUtil.printPath(job.getDescriptionFile()), e);
                addFailed(job);
            }
        });
    }

    pool.shutdown();
    // awaitTermination returns false if the timeout elapses first; make that visible instead of
    // silently reporting on jobs that are still running.
    if (!pool.awaitTermination(24, TimeUnit.HOURS)) {
        log.error("Preprocessing did not terminate within 24 hours, the following summary is incomplete.");
    }

    // NOTE(review): the location is set on the worker threads but cleared here on the calling thread - confirm this is intended.
    ConqueryMDC.clearLocation();

    if (!success.isEmpty()) {
        log.info("Successfully Preprocess {} Jobs:", success.size());
        success.forEach(desc -> log.info("\tSucceeded Preprocessing for {}", desc));
    }

    if (!missing.isEmpty()) {
        log.warn("Did not find {} Files", missing.size());
        missing.forEach(desc -> log.warn("\tDid not find file for {}", desc));
    }

    if (isFailed()) {
        log.error("Failed {} Preprocessing Jobs:", failed.size());
        failed.forEach(desc -> log.error("\tFailed Preprocessing for {}", desc));
        doFail();
    }
}
Usage of com.bakdata.conquery.models.preproc.PreprocessingJob in the conquery project by bakdata.
Class PreprocessorCommand, method tryExtractDescriptor.
/**
 * Tries to read and validate a single description file and to wrap it into a {@link PreprocessingJob}.
 *
 * <p>Any exception raised while reading, validating or building the job is logged and the description
 * file is recorded in {@code failed}; with fail-fast enabled {@code doFail()} is called first.
 *
 * @param validator       validator applied to the descriptor that was read
 * @param tag             optional tag; if present it also replaces the descriptor's name
 * @param descriptionFile the description file to read
 * @param outputDir       directory handed to the job as output location
 * @param csvDir          directory handed to the job as csv location
 * @return the job, or an empty Optional if the description could not be processed
 * @throws IOException declared in the signature; exceptions raised inside the body are caught and recorded
 */
private Optional<PreprocessingJob> tryExtractDescriptor(Validator validator, Optional<String> tag, File descriptionFile, File outputDir, File csvDir) throws IOException {
    try {
        final TableImportDescriptor descriptor = TableImportDescriptor.read(descriptionFile);

        // Validate the descriptor that was just read (not the validator itself) and act on the
        // result: violations are routed into the failure handling below.
        final var violations = validator.validate(descriptor);
        if (!violations.isEmpty()) {
            throw new IllegalStateException("Description file " + descriptionFile + " is invalid: " + violations);
        }

        final PreprocessingJob preprocessingJob = new PreprocessingJob(csvDir.toPath(), descriptionFile, outputDir.toPath(), tag, descriptor);

        // Override name to tag if present
        tag.ifPresent(descriptor::setName);

        return Optional.of(preprocessingJob);
    } catch (Exception e) {
        log.error("Failed to process " + LogUtil.printPath(descriptionFile), e);
        if (isFailFast) {
            doFail();
        }
        failed.add(descriptionFile.toString());
    }
    return Optional.empty();
}
Usage of com.bakdata.conquery.models.preproc.PreprocessingJob in the conquery project by bakdata.
Class PreprocessorCommand, method findPreprocessingDescriptions.
/**
 * Collects the preprocessing jobs described by a description file or by a directory of them.
 *
 * <p>A regular file is used as is; otherwise the entries of {@code descriptionFiles} whose name ends
 * with {@code ConqueryConstants.EXTENSION_DESCRIPTION} are used. Descriptions that cannot be turned
 * into a job are left out of the result.
 *
 * @param descriptionFiles a single description file, or a directory to search for description files
 * @param inDir            directory passed on as csv location of the jobs
 * @param outputDir        directory passed on as output location of the jobs
 * @param tag              optional tag passed on to every job
 * @param validator        validator passed on to the extraction of each descriptor
 * @return the jobs that could be extracted; empty if the directory listing yields {@code null}
 * @throws IOException propagated from the extraction of a descriptor
 */
public List<PreprocessingJob> findPreprocessingDescriptions(File descriptionFiles, File inDir, File outputDir, Optional<String> tag, Validator validator) throws IOException {
    final File[] candidates;
    if (descriptionFiles.isFile()) {
        candidates = new File[] { descriptionFiles };
    } else {
        candidates = descriptionFiles.listFiles((parent, fileName) -> fileName.endsWith(ConqueryConstants.EXTENSION_DESCRIPTION));
    }

    // listFiles yields null when the path cannot be listed.
    if (candidates == null) {
        return Collections.emptyList();
    }

    final List<PreprocessingJob> found = new ArrayList<>();
    for (File candidate : candidates) {
        final Optional<PreprocessingJob> extracted = tryExtractDescriptor(validator, tag, candidate, outputDir, inDir);
        if (extracted.isPresent()) {
            found.add(extracted.get());
        }
    }
    return found;
}
Aggregations