Search in sources :

Example 1 with DateReader

Use of com.bakdata.conquery.util.DateReader in project conquery by bakdata.

In the class Preprocessor, method preprocess.

/**
 * Apply transformations in descriptor, then write them out to CQPP file for imports.
 * <p>
 * Reads CSV file, per row extracts the primary key, then applies other transformations on each row, then compresses the data with {@link ColumnStore}.
 * <p>
 * Rows that fail to parse do not abort the job: they are counted, grouped by exception class and logged
 * (at most the configured maximum number of them, unless trace logging is enabled). Rows rejected by the
 * input's filter are skipped and counted neither as lines nor as errors. The result is first written to a
 * {@code .tmp} file next to the target and only moved to the target location when the ratio of faulty
 * lines does not exceed the configured threshold.
 *
 * @param preprocessingJob the job providing the descriptor, the CSV directory, the tag and the target file
 * @param totalProgress    progress bar that is advanced by the number of bytes read from the source files
 * @param config           configuration providing CSV settings, the locale's {@link DateReader} and the preprocessor limits
 * @throws IOException              if a source file does not exist or is not readable ({@link FileNotFoundException}),
 *                                  or if reading, writing, deleting or moving a file fails
 * @throws IllegalArgumentException if the directory of the target file is not writable
 * @throws RuntimeException         if the ratio of faulty lines exceeds the configured faulty line threshold
 */
public static void preprocess(PreprocessingJob preprocessingJob, ProgressBar totalProgress, ConqueryConfig config) throws IOException {
    final File preprocessedFile = preprocessingJob.getPreprocessedFile();
    final TableImportDescriptor descriptor = preprocessingJob.getDescriptor();
    // Create temp file that will be moved when finished (we ensure the same file system, to avoid unnecessary copying)
    final File tmp = new File(preprocessedFile.getParentFile(), preprocessedFile.getName() + ".tmp");
    // Ensures deletion on failure (the file is removed when the JVM exits, unless it was moved before)
    tmp.deleteOnExit();
    // tmp and the target file live in the same directory, so a single permission check covers both.
    if (!Files.isWritable(tmp.getParentFile().toPath())) {
        throw new IllegalArgumentException("No write permission in " + LogUtil.printPath(tmp.getParentFile()));
    }
    // delete target file if it exists
    if (preprocessedFile.exists()) {
        FileUtils.forceDelete(preprocessedFile);
    }
    log.info("PREPROCESSING START in {}", preprocessingJob);
    int errors = 0;
    final Preprocessed result = new Preprocessed(config, preprocessingJob);
    // Counts the rows that passed the filter (across all inputs), including rows that failed to parse.
    long lineId = 0;
    // Held as long so that `maxPrintedErrors + 1` below cannot overflow.
    final long maxPrintedErrors = config.getPreprocessor().getMaximumPrintedErrors();
    // Gather exception classes to get better overview of what kind of errors are happening.
    final Object2IntMap<Class<? extends Throwable>> exceptions = new Object2IntArrayMap<>();
    exceptions.defaultReturnValue(0);
    for (int inputSource = 0; inputSource < descriptor.getInputs().length; inputSource++) {
        final TableInputDescriptor input = descriptor.getInputs()[inputSource];
        final File sourceFile = resolveSourceFile(input.getSourceFile(), preprocessingJob.getCsvDirectory(), preprocessingJob.getTag());
        final String name = String.format("%s:%s[%d/%s]", descriptor.toString(), descriptor.getTable(), inputSource, sourceFile.getName());
        ConqueryMDC.setLocation(name);
        if (!(sourceFile.exists() && sourceFile.canRead())) {
            throw new FileNotFoundException(sourceFile.getAbsolutePath());
        }
        CsvParser parser = null;
        try (CountingInputStream countingIn = new CountingInputStream(new FileInputStream(sourceFile))) {
            // Bytes already reported to totalProgress for this source file.
            long progress = 0;
            final CSVConfig csvSettings = config.getCsv();
            // Create CSV parser according to config, but overriding some behaviour.
            parser = csvSettings.withParseHeaders(true).withSkipHeader(false).createParser();
            parser.beginParsing(FileUtil.isGZipped(sourceFile) ? new GZIPInputStream(countingIn) : countingIn, csvSettings.getEncoding());
            final String[] headers = parser.getContext().parsedHeaders();
            final Object2IntArrayMap<String> headerMap = TableInputDescriptor.buildHeaderMap(headers);
            // Compile filter.
            final GroovyPredicate filter = input.createFilter(headers);
            final DateReader dateReader = config.getLocale().getDateReader();
            final OutputDescription.Output primaryOut = input.getPrimary().createForHeaders(headerMap, dateReader);
            final List<OutputDescription.Output> outputs = new ArrayList<>();
            final PPColumn[] columns = result.getColumns();
            // Instantiate Outputs based on descriptors (apply header positions)
            for (OutputDescription op : input.getOutput()) {
                outputs.add(op.createForHeaders(headerMap, dateReader));
            }
            String[] row;
            // Read all CSV lines, apply Output transformations and add them to preprocessed.
            while ((row = parser.parseNext()) != null) {
                // This is explicitly NOT in a try-catch block as scripts may not fail and we should not recover from faulty scripts.
                if (filter != null && !filter.filterRow(row)) {
                    continue;
                }
                try {
                    int primaryId = (int) Objects.requireNonNull(primaryOut.createOutput(row, result.getPrimaryColumn(), lineId), "primaryId may not be null");
                    final int primary = result.addPrimary(primaryId);
                    final Object[] outRow = applyOutputs(outputs, columns, row, lineId);
                    result.addRow(primary, columns, outRow);
                } catch (OutputDescription.OutputException e) {
                    // Fall back to the exception itself when it carries no cause, so a missing cause
                    // cannot raise a NullPointerException from within this handler.
                    final Throwable cause = e.getCause() != null ? e.getCause() : e;
                    exceptions.put(cause.getClass(), exceptions.getInt(cause.getClass()) + 1);
                    errors++;
                    // `<=` so that exactly maxPrintedErrors lines are printed, matching the message below.
                    if (log.isTraceEnabled() || errors <= maxPrintedErrors) {
                        log.warn("Failed to parse `{}` from line: {} content: {}", e.getSource(), lineId, row, cause);
                    } else if (errors == maxPrintedErrors + 1) {
                        log.warn("More erroneous lines occurred. Only the first {} were printed.", maxPrintedErrors);
                    }
                } catch (Exception e) {
                    exceptions.put(e.getClass(), exceptions.getInt(e.getClass()) + 1);
                    errors++;
                    // `<=` so that exactly maxPrintedErrors lines are printed, matching the message below.
                    if (log.isTraceEnabled() || errors <= maxPrintedErrors) {
                        log.warn("Failed to parse line: {} content: {}", lineId, row, e);
                    } else if (errors == maxPrintedErrors + 1) {
                        log.warn("More erroneous lines occurred. Only the first {} were printed.", maxPrintedErrors);
                    }
                } finally {
                    // report progress
                    totalProgress.addCurrentValue(countingIn.getCount() - progress);
                    progress = countingIn.getCount();
                    lineId++;
                }
            }
        } finally {
            if (parser != null) {
                parser.stopParsing();
            }
        }
    }
    if (errors > 0) {
        log.warn("File `{}` contained {} faulty lines of ~{} total.", preprocessingJob, errors, lineId);
    }
    if (log.isWarnEnabled()) {
        exceptions.forEach((clazz, count) -> log.warn("Got {} `{}`", count, clazz.getSimpleName()));
    }
    result.write(tmp);
    if (errors > 0) {
        log.warn("Had {}% faulty lines ({} of ~{} lines)", String.format("%.2f", 100d * (double) errors / (double) lineId), errors, lineId);
    }
    // With zero lines the ratio is NaN, which never compares greater than the threshold.
    if ((double) errors / (double) lineId > config.getPreprocessor().getFaultyLineThreshold()) {
        throw new RuntimeException("Too many faulty lines.");
    }
    // if successful move the tmp file to the target location
    FileUtils.moveFile(tmp, preprocessedFile);
    log.info("PREPROCESSING DONE in {}", preprocessingJob);
}
Also used : FileNotFoundException(java.io.FileNotFoundException) ArrayList(java.util.ArrayList) DateReader(com.bakdata.conquery.util.DateReader) GZIPInputStream(java.util.zip.GZIPInputStream) OutputDescription(com.bakdata.conquery.models.preproc.outputs.OutputDescription) CsvParser(com.univocity.parsers.csv.CsvParser) CountingInputStream(com.google.common.io.CountingInputStream) FileInputStream(java.io.FileInputStream) CSVConfig(com.bakdata.conquery.models.config.CSVConfig) ParsingException(com.bakdata.conquery.models.exceptions.ParsingException) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) Object2IntArrayMap(it.unimi.dsi.fastutil.objects.Object2IntArrayMap) UtilityClass(lombok.experimental.UtilityClass) File(java.io.File)

Aggregations

CSVConfig (com.bakdata.conquery.models.config.CSVConfig)1 ParsingException (com.bakdata.conquery.models.exceptions.ParsingException)1 OutputDescription (com.bakdata.conquery.models.preproc.outputs.OutputDescription)1 DateReader (com.bakdata.conquery.util.DateReader)1 CountingInputStream (com.google.common.io.CountingInputStream)1 CsvParser (com.univocity.parsers.csv.CsvParser)1 Object2IntArrayMap (it.unimi.dsi.fastutil.objects.Object2IntArrayMap)1 File (java.io.File)1 FileInputStream (java.io.FileInputStream)1 FileNotFoundException (java.io.FileNotFoundException)1 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 GZIPInputStream (java.util.zip.GZIPInputStream)1 UtilityClass (lombok.experimental.UtilityClass)1