Use of com.bakdata.conquery.util.DateReader in project conquery by bakdata.
The class Preprocessor, method preprocess:
/**
 * Applies the transformations of the descriptor, then writes the result to a CQPP file for import.
 * <p>
 * Reads the CSV file, extracts the primary key from each row, applies the remaining output transformations per row, then compresses the data with {@link ColumnStore}.
 */
public static void preprocess(PreprocessingJob preprocessingJob, ProgressBar totalProgress, ConqueryConfig config) throws IOException {
    final File preprocessedFile = preprocessingJob.getPreprocessedFile();
    TableImportDescriptor descriptor = preprocessingJob.getDescriptor();
    // Create temp file that will be moved when finished (we ensure the same file system, to avoid unnecessary copying)
    File tmp = new File(preprocessedFile.getParentFile(), preprocessedFile.getName() + ".tmp");
    // Ensures deletion on failure
    tmp.deleteOnExit();
    if (!Files.isWritable(tmp.getParentFile().toPath())) {
        throw new IllegalArgumentException("No write permission in " + LogUtil.printPath(tmp.getParentFile()));
    }
    if (!Files.isWritable(preprocessedFile.toPath().getParent())) {
        throw new IllegalArgumentException("No write permission in " + LogUtil.printPath(preprocessedFile.toPath().getParent()));
    }
    // delete target file if it exists
    if (preprocessedFile.exists()) {
        FileUtils.forceDelete(preprocessedFile);
    }
    log.info("PREPROCESSING START in {}", preprocessingJob);
    int errors = 0;
    final Preprocessed result = new Preprocessed(config, preprocessingJob);
    long lineId = 0;
    // Gather exception classes to get better overview of what kind of errors are happening.
    Object2IntMap<Class<? extends Throwable>> exceptions = new Object2IntArrayMap<>();
    exceptions.defaultReturnValue(0);
    for (int inputSource = 0; inputSource < descriptor.getInputs().length; inputSource++) {
        final TableInputDescriptor input = descriptor.getInputs()[inputSource];
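        // Resolve the input's source file against the job's csv directory and tag.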
        final File sourceFile = resolveSourceFile(input.getSourceFile(), preprocessingJob.getCsvDirectory(), preprocessingJob.getTag());
        final String name = String.format("%s:%s[%d/%s]", descriptor.toString(), descriptor.getTable(), inputSource, sourceFile.getName());
        ConqueryMDC.setLocation(name);
        if (!(sourceFile.exists() && sourceFile.canRead())) {
            throw new FileNotFoundException(sourceFile.getAbsolutePath());
        }
        CsvParser parser = null;
        try (CountingInputStream countingIn = new CountingInputStream(new FileInputStream(sourceFile))) {
            long progress = 0;
            CSVConfig csvSettings = config.getCsv();
            // Create CSV parser according to config, but overriding some behaviour.
            parser = csvSettings.withParseHeaders(true).withSkipHeader(false).createParser();
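            // The source may be gzip-compressed; decompress transparently while counting raw bytes for progress reporting.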
            parser.beginParsing(FileUtil.isGZipped(sourceFile) ? new GZIPInputStream(countingIn) : countingIn, csvSettings.getEncoding());
            final String[] headers = parser.getContext().parsedHeaders();
            final Object2IntArrayMap<String> headerMap = TableInputDescriptor.buildHeaderMap(headers);
            // Compile filter.
            final GroovyPredicate filter = input.createFilter(headers);
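            // The DateReader parses date values according to the formats configured via the locale settings.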
            DateReader dateReader = config.getLocale().getDateReader();
            final OutputDescription.Output primaryOut = input.getPrimary().createForHeaders(headerMap, dateReader);
            final List<OutputDescription.Output> outputs = new ArrayList<>();
            final PPColumn[] columns = result.getColumns();
            // Instantiate Outputs based on descriptors (apply header positions)
            for (OutputDescription op : input.getOutput()) {
                outputs.add(op.createForHeaders(headerMap, dateReader));
            }
            String[] row;
            // Read all CSV lines, apply the Output transformations and add them to the Preprocessed result.
            while ((row = parser.parseNext()) != null) {
                // This is explicitly NOT in a try-catch block, as scripts must not fail and we should not recover from faulty scripts.
                if (filter != null && !filter.filterRow(row)) {
                    continue;
                }
                try {
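                    // Extract the primary key first: it determines the entity the remaining outputs of this row are assigned to.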
                    int primaryId = (int) Objects.requireNonNull(primaryOut.createOutput(row, result.getPrimaryColumn(), lineId), "primaryId may not be null");
                    final int primary = result.addPrimary(primaryId);
                    final Object[] outRow = applyOutputs(outputs, columns, row, lineId);
                    result.addRow(primary, columns, outRow);
                } catch (OutputDescription.OutputException e) {
                    exceptions.put(e.getCause().getClass(), exceptions.getInt(e.getCause().getClass()) + 1);
                    errors++;
                    if (log.isTraceEnabled() || errors < config.getPreprocessor().getMaximumPrintedErrors()) {
                        log.warn("Failed to parse `{}` from line: {} content: {}", e.getSource(), lineId, row, e.getCause());
                    } else if (errors == config.getPreprocessor().getMaximumPrintedErrors()) {
                        log.warn("More erroneous lines occurred. Only the first " + config.getPreprocessor().getMaximumPrintedErrors() + " were printed.");
                    }
                } catch (Exception e) {
                    exceptions.put(e.getClass(), exceptions.getInt(e.getClass()) + 1);
                    errors++;
                    if (log.isTraceEnabled() || errors < config.getPreprocessor().getMaximumPrintedErrors()) {
                        log.warn("Failed to parse line: {} content: {}", lineId, row, e);
                    } else if (errors == config.getPreprocessor().getMaximumPrintedErrors()) {
                        log.warn("More erroneous lines occurred. Only the first " + config.getPreprocessor().getMaximumPrintedErrors() + " were printed.");
                    }
                } finally {
                    // report progress
                    totalProgress.addCurrentValue(countingIn.getCount() - progress);
                    progress = countingIn.getCount();
                    lineId++;
                }
            }
        } finally {
            if (parser != null) {
                parser.stopParsing();
            }
        }
    }
    if (errors > 0) {
        log.warn("File `{}` contained {} faulty lines of ~{} total.", preprocessingJob, errors, lineId);
    }
    if (log.isWarnEnabled()) {
        exceptions.forEach((clazz, count) -> log.warn("Got {} `{}`", count, clazz.getSimpleName()));
    }
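    // Write the collected and compressed column data to the temporary file.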
    result.write(tmp);
    if (errors > 0) {
        log.warn("Had {}% faulty lines ({} of ~{} lines)", String.format("%.2f", 100d * (double) errors / (double) lineId), errors, lineId);
    }
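    // Fail the whole job if the ratio of faulty lines exceeds the configured threshold.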
    if ((double) errors / (double) lineId > config.getPreprocessor().getFaultyLineThreshold()) {
        throw new RuntimeException("Too many faulty lines.");
    }
    // if successful move the tmp file to the target location
    FileUtils.moveFile(tmp, preprocessedFile);
    log.info("PREPROCESSING DONE in {}", preprocessingJob);
}