
Example 1 with CSVConfig

Use of com.bakdata.conquery.models.config.CSVConfig in project conquery by bakdata.

The class FilterResolutionExactTest, method execute:

@Override
public void execute(StandaloneSupport conquery) throws Exception {
    // read test specification
    String testJson = In.resource("/tests/query/MULTI_SELECT_DATE_RESTRICTION_OR_CONCEPT_QUERY/MULTI_SELECT_DATE_RESTRICTION_OR_CONCEPT_QUERY.test.json").withUTF8().readAll();
    DatasetId dataset = conquery.getDataset().getId();
    ConqueryTestSpec test = JsonIntegrationTest.readJson(dataset, testJson);
    ValidatorHelper.failOnError(log, conquery.getValidator().validate(test));
    CSVConfig csvConf = conquery.getConfig().getCsv();
    test.importRequiredData(conquery);
    FilterSearch.updateSearch(conquery.getNamespace().getNamespaces(), Collections.singleton(conquery.getNamespace().getDataset()), conquery.getDatasetsProcessor().getJobManager(), csvConf);
    conquery.waitUntilWorkDone();
    Concept<?> concept = conquery.getNamespace().getStorage().getAllConcepts().iterator().next();
    Connector connector = concept.getConnectors().iterator().next();
    AbstractSelectFilter<?> filter = (AbstractSelectFilter<?>) connector.getFilters().iterator().next();
    // Write the search values to a temp CSV file.
    final Path tmpCSv = Files.createTempFile("conquery_search", ".csv");
    Files.write(tmpCSv, String.join(csvConf.getLineSeparator(), lines).getBytes(), StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.CREATE, StandardOpenOption.WRITE);
    filter.setSearchType(FilterSearch.FilterSearchType.EXACT);
    filter.setTemplate(new FilterTemplate(tmpCSv.toString(), Arrays.asList("HEADER"), "HEADER", "", ""));
    FilterSearch.createSourceSearch(filter, csvConf);
    assertThat(filter.getSourceSearch()).isNotNull();
    ConceptsProcessor processor = new ConceptsProcessor(conquery.getNamespace().getNamespaces());
    // from csv
    {
        ResolvedConceptsResult resolved = processor.resolveFilterValues(filter, List.of("a", "aaa", "unknown"));
        // check the resolved values
        assertThat(resolved.getResolvedFilter().getValue().stream().map(FEValue::getValue)).containsExactlyInAnyOrder("a", "aaa");
        assertThat(resolved.getUnknownCodes()).containsExactlyInAnyOrder("unknown");
    }
    // from column values
    {
        ResolvedConceptsResult resolved = processor.resolveFilterValues(filter, List.of("f", "unknown"));
        // check the resolved values
        assertThat(resolved.getResolvedFilter().getValue().stream().map(FEValue::getValue)).containsExactlyInAnyOrder("f");
        assertThat(resolved.getUnknownCodes()).containsExactlyInAnyOrder("unknown");
    }
}
Also used : Path(java.nio.file.Path) Connector(com.bakdata.conquery.models.datasets.concepts.Connector) AbstractSelectFilter(com.bakdata.conquery.models.datasets.concepts.filters.specific.AbstractSelectFilter) ConqueryTestSpec(com.bakdata.conquery.integration.json.ConqueryTestSpec) ConceptsProcessor(com.bakdata.conquery.resources.api.ConceptsProcessor) CSVConfig(com.bakdata.conquery.models.config.CSVConfig) DatasetId(com.bakdata.conquery.models.identifiable.ids.specific.DatasetId) FilterTemplate(com.bakdata.conquery.apiv1.FilterTemplate) ResolvedConceptsResult(com.bakdata.conquery.resources.api.ConceptsProcessor.ResolvedConceptsResult)
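
The assertions above pin down the contract of resolveFilterValues under EXACT search: a requested value resolves only if it equals an indexed value verbatim; everything else lands in unknownCodes. (lines is a field of the test class holding the rows of the search CSV.) Below is a minimal, self-contained sketch of that split, not conquery's FilterSearch implementation; the class and method names are hypothetical:

import java.util.ArrayList;
import java.util.List;
import java.util.Set;

// Hypothetical sketch of the resolved/unknown split asserted above (EXACT search).
public final class ExactResolveSketch {

    public record Resolution(List<String> resolved, List<String> unknown) {}

    public static Resolution resolve(Set<String> indexed, List<String> requested) {
        List<String> resolved = new ArrayList<>();
        List<String> unknown = new ArrayList<>();
        for (String value : requested) {
            // EXACT: a request matches only an identical indexed value.
            (indexed.contains(value) ? resolved : unknown).add(value);
        }
        return new Resolution(resolved, unknown);
    }

    public static void main(String[] args) {
        Resolution result = resolve(Set.of("a", "aaa", "aab", "baaa", "f"), List.of("a", "aaa", "unknown"));
        System.out.println(result.resolved()); // [a, aaa]
        System.out.println(result.unknown());  // [unknown]
    }
}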

Example 2 with CSVConfig

Use of com.bakdata.conquery.models.config.CSVConfig in project conquery by bakdata.

The class FilterResolutionPrefixTest, method execute:

@Override
public void execute(StandaloneSupport conquery) throws Exception {
    // read test specification
    String testJson = In.resource("/tests/query/MULTI_SELECT_DATE_RESTRICTION_OR_CONCEPT_QUERY/MULTI_SELECT_DATE_RESTRICTION_OR_CONCEPT_QUERY.test.json").withUTF8().readAll();
    DatasetId dataset = conquery.getDataset().getId();
    ConqueryTestSpec test = JsonIntegrationTest.readJson(dataset, testJson);
    ValidatorHelper.failOnError(log, conquery.getValidator().validate(test));
    test.importRequiredData(conquery);
    CSVConfig csvConf = conquery.getConfig().getCsv();
    FilterSearch.updateSearch(conquery.getNamespace().getNamespaces(), Collections.singleton(conquery.getNamespace().getDataset()), conquery.getDatasetsProcessor().getJobManager(), csvConf);
    conquery.waitUntilWorkDone();
    Concept<?> concept = conquery.getNamespace().getStorage().getAllConcepts().iterator().next();
    Connector connector = concept.getConnectors().iterator().next();
    AbstractSelectFilter<?> filter = (AbstractSelectFilter<?>) connector.getFilters().iterator().next();
    // Write the search values to a temp CSV file.
    final Path tmpCSv = Files.createTempFile("conquery_search", ".csv");
    Files.write(tmpCSv, String.join(csvConf.getLineSeparator(), lines).getBytes(), StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.CREATE, StandardOpenOption.WRITE);
    filter.setSearchType(FilterSearch.FilterSearchType.PREFIX);
    filter.setTemplate(new FilterTemplate(tmpCSv.toString(), Arrays.asList("HEADER"), "HEADER", "", ""));
    FilterSearch.createSourceSearch(filter, csvConf);
    assertThat(filter.getSourceSearch()).isNotNull();
    ConceptsProcessor processor = new ConceptsProcessor(conquery.getNamespace().getNamespaces());
    // from csv
    {
        ResolvedConceptsResult resolved = processor.resolveFilterValues(filter, List.of("a", "unknown"));
        // check the resolved values
        assertThat(resolved.getResolvedFilter().getValue().stream().map(FEValue::getValue)).containsExactlyInAnyOrder("a", "aaa", "aab");
        assertThat(resolved.getUnknownCodes()).containsExactlyInAnyOrder("unknown");
    }
    // from column values
    {
        ResolvedConceptsResult resolved = processor.resolveFilterValues(filter, List.of("f", "unknown"));
        // check the resolved values
        assertThat(resolved.getResolvedFilter().getValue().stream().map(FEValue::getValue)).containsExactlyInAnyOrder("f");
        assertThat(resolved.getUnknownCodes()).containsExactlyInAnyOrder("unknown");
    }
}
Also used : Path(java.nio.file.Path) Connector(com.bakdata.conquery.models.datasets.concepts.Connector) AbstractSelectFilter(com.bakdata.conquery.models.datasets.concepts.filters.specific.AbstractSelectFilter) ConqueryTestSpec(com.bakdata.conquery.integration.json.ConqueryTestSpec) ConceptsProcessor(com.bakdata.conquery.resources.api.ConceptsProcessor) CSVConfig(com.bakdata.conquery.models.config.CSVConfig) DatasetId(com.bakdata.conquery.models.identifiable.ids.specific.DatasetId) FilterTemplate(com.bakdata.conquery.apiv1.FilterTemplate) ResolvedConceptsResult(com.bakdata.conquery.resources.api.ConceptsProcessor.ResolvedConceptsResult)
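
Under PREFIX search the same request resolves to more values: the assertion above shows "a" expanding to every indexed value that starts with it ("a", "aaa", "aab"). A hedged sketch of just that predicate, again hypothetical rather than conquery's implementation:

import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

// Hypothetical sketch of PREFIX matching as asserted above.
public final class PrefixResolveSketch {

    public static List<String> matches(Set<String> indexed, String request) {
        return indexed.stream()
                .filter(value -> value.startsWith(request))
                .sorted()
                .collect(Collectors.toList());
    }

    public static void main(String[] args) {
        // Mirrors the assertion above: "a" resolves to "a", "aaa" and "aab".
        System.out.println(matches(Set.of("a", "aaa", "aab", "baaa", "f"), "a"));
    }
}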

Example 3 with CSVConfig

Use of com.bakdata.conquery.models.config.CSVConfig in project conquery by bakdata.

The class Preprocessor, method preprocess:

/**
 * Apply the transformations defined in the descriptor, then write the result out to a CQPP file for import.
 * <p>
 * Reads the CSV file, extracts the primary key from each row, applies the remaining transformations per row, then compresses the data with {@link ColumnStore}.
 */
public static void preprocess(PreprocessingJob preprocessingJob, ProgressBar totalProgress, ConqueryConfig config) throws IOException {
    final File preprocessedFile = preprocessingJob.getPreprocessedFile();
    TableImportDescriptor descriptor = preprocessingJob.getDescriptor();
    // Create temp file that will be moved when finished (we ensure the same file system, to avoid unnecessary copying)
    File tmp = new File(preprocessedFile.getParentFile(), preprocessedFile.getName() + ".tmp");
    // Ensures deletion on failure
    tmp.deleteOnExit();
    if (!Files.isWritable(tmp.getParentFile().toPath())) {
        throw new IllegalArgumentException("No write permission in " + LogUtil.printPath(tmp.getParentFile()));
    }
    if (!Files.isWritable(preprocessedFile.toPath().getParent())) {
        throw new IllegalArgumentException("No write permission in " + LogUtil.printPath(preprocessedFile.toPath().getParent()));
    }
    // delete target file if it exists
    if (preprocessedFile.exists()) {
        FileUtils.forceDelete(preprocessedFile);
    }
    log.info("PREPROCESSING START in {}", preprocessingJob);
    int errors = 0;
    final Preprocessed result = new Preprocessed(config, preprocessingJob);
    long lineId = 0;
    // Gather exception classes to get a better overview of what kinds of errors are happening.
    Object2IntMap<Class<? extends Throwable>> exceptions = new Object2IntArrayMap<>();
    exceptions.defaultReturnValue(0);
    for (int inputSource = 0; inputSource < descriptor.getInputs().length; inputSource++) {
        final TableInputDescriptor input = descriptor.getInputs()[inputSource];
        final File sourceFile = resolveSourceFile(input.getSourceFile(), preprocessingJob.getCsvDirectory(), preprocessingJob.getTag());
        final String name = String.format("%s:%s[%d/%s]", descriptor.toString(), descriptor.getTable(), inputSource, sourceFile.getName());
        ConqueryMDC.setLocation(name);
        if (!(sourceFile.exists() && sourceFile.canRead())) {
            throw new FileNotFoundException(sourceFile.getAbsolutePath());
        }
        CsvParser parser = null;
        try (CountingInputStream countingIn = new CountingInputStream(new FileInputStream(sourceFile))) {
            long progress = 0;
            CSVConfig csvSettings = config.getCsv();
            // Create CSV parser according to config, but overriding some behaviour.
            parser = csvSettings.withParseHeaders(true).withSkipHeader(false).createParser();
            parser.beginParsing(FileUtil.isGZipped(sourceFile) ? new GZIPInputStream(countingIn) : countingIn, csvSettings.getEncoding());
            final String[] headers = parser.getContext().parsedHeaders();
            final Object2IntArrayMap<String> headerMap = TableInputDescriptor.buildHeaderMap(headers);
            // Compile filter.
            final GroovyPredicate filter = input.createFilter(headers);
            DateReader dateReader = config.getLocale().getDateReader();
            final OutputDescription.Output primaryOut = input.getPrimary().createForHeaders(headerMap, dateReader);
            final List<OutputDescription.Output> outputs = new ArrayList<>();
            final PPColumn[] columns = result.getColumns();
            // Instantiate Outputs based on descriptors (apply header positions)
            for (OutputDescription op : input.getOutput()) {
                outputs.add(op.createForHeaders(headerMap, dateReader));
            }
            String[] row;
            // Read all CSV lines, apply the Output transformations and add them to the Preprocessed result.
            while ((row = parser.parseNext()) != null) {
                // This is explicitly NOT in a try-catch block: scripts must not fail, and we should not recover from faulty scripts.
                if (filter != null && !filter.filterRow(row)) {
                    continue;
                }
                try {
                    int primaryId = (int) Objects.requireNonNull(primaryOut.createOutput(row, result.getPrimaryColumn(), lineId), "primaryId may not be null");
                    final int primary = result.addPrimary(primaryId);
                    final Object[] outRow = applyOutputs(outputs, columns, row, lineId);
                    result.addRow(primary, columns, outRow);
                } catch (OutputDescription.OutputException e) {
                    exceptions.put(e.getCause().getClass(), exceptions.getInt(e.getCause().getClass()) + 1);
                    errors++;
                    if (log.isTraceEnabled() || errors < config.getPreprocessor().getMaximumPrintedErrors()) {
                        log.warn("Failed to parse `{}` from line: {} content: {}", e.getSource(), lineId, row, e.getCause());
                    } else if (errors == config.getPreprocessor().getMaximumPrintedErrors()) {
                        log.warn("More erroneous lines occurred. Only the first " + config.getPreprocessor().getMaximumPrintedErrors() + " were printed.");
                    }
                } catch (Exception e) {
                    exceptions.put(e.getClass(), exceptions.getInt(e.getClass()) + 1);
                    errors++;
                    if (log.isTraceEnabled() || errors < config.getPreprocessor().getMaximumPrintedErrors()) {
                        log.warn("Failed to parse line: {} content: {}", lineId, row, e);
                    } else if (errors == config.getPreprocessor().getMaximumPrintedErrors()) {
                        log.warn("More erroneous lines occurred. Only the first " + config.getPreprocessor().getMaximumPrintedErrors() + " were printed.");
                    }
                } finally {
                    // report progress
                    totalProgress.addCurrentValue(countingIn.getCount() - progress);
                    progress = countingIn.getCount();
                    lineId++;
                }
            }
        } finally {
            if (parser != null) {
                parser.stopParsing();
            }
        }
    }
    if (errors > 0) {
        log.warn("File `{}` contained {} faulty lines of ~{} total.", preprocessingJob, errors, lineId);
    }
    if (log.isWarnEnabled()) {
        exceptions.forEach((clazz, count) -> log.warn("Got {} `{}`", count, clazz.getSimpleName()));
    }
    result.write(tmp);
    if (errors > 0) {
        log.warn("Had {}% faulty lines ({} of ~{} lines)", String.format("%.2f", 100d * (double) errors / (double) lineId), errors, lineId);
    }
    if ((double) errors / (double) lineId > config.getPreprocessor().getFaultyLineThreshold()) {
        throw new RuntimeException("Too many faulty lines.");
    }
    // if successful move the tmp file to the target location
    FileUtils.moveFile(tmp, preprocessedFile);
    log.info("PREPROCESSING DONE in {}", preprocessingJob);
}
Also used : FileNotFoundException(java.io.FileNotFoundException) ArrayList(java.util.ArrayList) DateReader(com.bakdata.conquery.util.DateReader) GZIPInputStream(java.util.zip.GZIPInputStream) OutputDescription(com.bakdata.conquery.models.preproc.outputs.OutputDescription) CsvParser(com.univocity.parsers.csv.CsvParser) CountingInputStream(com.google.common.io.CountingInputStream) FileInputStream(java.io.FileInputStream) CSVConfig(com.bakdata.conquery.models.config.CSVConfig) ParsingException(com.bakdata.conquery.models.exceptions.ParsingException) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) Object2IntArrayMap(it.unimi.dsi.fastutil.objects.Object2IntArrayMap) UtilityClass(lombok.experimental.UtilityClass) File(java.io.File)
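
Two details of the loop above are worth isolating. Progress is tracked on the raw byte stream through Guava's CountingInputStream, so getCount() reports the real file position even when the parser reads through a GZIP wrapper, and gzip detection happens once, up front, on the source file. The following self-contained sketch reproduces that stream setup; isGZipped here is a hypothetical stand-in for conquery's FileUtil.isGZipped, checking the two gzip magic bytes:

import com.google.common.io.CountingInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.GZIPInputStream;

// Hypothetical sketch of the counting/gzip stream setup used in preprocess().
public final class ProgressStreamSketch {

    // Stand-in for FileUtil.isGZipped: check the two gzip magic bytes (0x1f 0x8b).
    static boolean isGZipped(File file) throws IOException {
        try (DataInputStream in = new DataInputStream(new FileInputStream(file))) {
            return in.readUnsignedByte() == 0x1f && in.readUnsignedByte() == 0x8b;
        }
    }

    public static void main(String[] args) throws IOException {
        File sourceFile = new File(args[0]);
        try (CountingInputStream countingIn = new CountingInputStream(new FileInputStream(sourceFile))) {
            // The counter wraps the file stream directly, so it measures compressed
            // bytes read, i.e. real file progress, regardless of the gzip layer on top.
            InputStream data = isGZipped(sourceFile) ? new GZIPInputStream(countingIn) : countingIn;
            long progress = 0;
            byte[] buffer = new byte[8192];
            while (data.read(buffer) != -1) {
                long delta = countingIn.getCount() - progress; // bytes advanced since the last report
                progress = countingIn.getCount();
                // a progress bar would consume delta here, as totalProgress.addCurrentValue does above
            }
        }
    }
}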

Example 4 with CSVConfig

Use of com.bakdata.conquery.models.config.CSVConfig in project conquery by bakdata.

The class FilterResolutionContainsTest, method execute:

@Override
public void execute(StandaloneSupport conquery) throws Exception {
    // read test specification
    String testJson = In.resource("/tests/query/MULTI_SELECT_DATE_RESTRICTION_OR_CONCEPT_QUERY/MULTI_SELECT_DATE_RESTRICTION_OR_CONCEPT_QUERY.test.json").withUTF8().readAll();
    DatasetId dataset = conquery.getDataset().getId();
    ConqueryTestSpec test = JsonIntegrationTest.readJson(dataset, testJson);
    ValidatorHelper.failOnError(log, conquery.getValidator().validate(test));
    test.importRequiredData(conquery);
    CSVConfig csvConf = conquery.getConfig().getCsv();
    FilterSearch.updateSearch(conquery.getNamespace().getNamespaces(), Collections.singleton(conquery.getNamespace().getDataset()), conquery.getDatasetsProcessor().getJobManager(), csvConf);
    conquery.waitUntilWorkDone();
    Concept<?> concept = conquery.getNamespace().getStorage().getAllConcepts().iterator().next();
    Connector connector = concept.getConnectors().iterator().next();
    AbstractSelectFilter<?> filter = (AbstractSelectFilter<?>) connector.getFilters().iterator().next();
    // Write the search values to a temp CSV file.
    final Path tmpCSv = Files.createTempFile("conquery_search", ".csv");
    Files.write(tmpCSv, String.join(csvConf.getLineSeparator(), lines).getBytes(), StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.CREATE, StandardOpenOption.WRITE);
    filter.setSearchType(FilterSearch.FilterSearchType.CONTAINS);
    filter.setTemplate(new FilterTemplate(tmpCSv.toString(), Arrays.asList("HEADER"), "HEADER", "", ""));
    FilterSearch.createSourceSearch(filter, csvConf);
    assertThat(filter.getSourceSearch()).isNotNull();
    ConceptsProcessor processor = new ConceptsProcessor(conquery.getNamespace().getNamespaces());
    // from csv
    {
        ResolvedConceptsResult resolved = processor.resolveFilterValues(filter, List.of("a", "unknown"));
        // check the resolved values
        assertThat(resolved.getResolvedFilter().getValue().stream().map(FEValue::getValue)).containsExactlyInAnyOrder("a", "aaa", "aab", "baaa");
        assertThat(resolved.getUnknownCodes()).containsExactlyInAnyOrder("unknown");
    }
    // from column values
    {
        ResolvedConceptsResult resolved = processor.resolveFilterValues(filter, List.of("f", "unknown"));
        // check the resolved values
        assertThat(resolved.getResolvedFilter().getValue().stream().map(FEValue::getValue)).containsExactlyInAnyOrder("f");
        assertThat(resolved.getUnknownCodes()).containsExactlyInAnyOrder("unknown");
    }
}
Also used : Path(java.nio.file.Path) Connector(com.bakdata.conquery.models.datasets.concepts.Connector) AbstractSelectFilter(com.bakdata.conquery.models.datasets.concepts.filters.specific.AbstractSelectFilter) ConqueryTestSpec(com.bakdata.conquery.integration.json.ConqueryTestSpec) ConceptsProcessor(com.bakdata.conquery.resources.api.ConceptsProcessor) CSVConfig(com.bakdata.conquery.models.config.CSVConfig) DatasetId(com.bakdata.conquery.models.identifiable.ids.specific.DatasetId) FilterTemplate(com.bakdata.conquery.apiv1.FilterTemplate) ResolvedConceptsResult(com.bakdata.conquery.resources.api.ConceptsProcessor.ResolvedConceptsResult)
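
CONTAINS is the loosest of the three search types: the assertion above shows "a" resolving to every indexed value that contains it anywhere, including "baaa". This sketch mirrors the prefix one after Example 2, differing only in the predicate; names remain hypothetical:

import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

// Hypothetical sketch of CONTAINS matching as asserted above.
public final class ContainsResolveSketch {

    public static List<String> matches(Set<String> indexed, String request) {
        return indexed.stream()
                .filter(value -> value.contains(request))
                .sorted()
                .collect(Collectors.toList());
    }

    public static void main(String[] args) {
        // Mirrors the assertion above: "a" resolves to "a", "aaa", "aab" and "baaa".
        System.out.println(matches(Set.of("a", "aaa", "aab", "baaa", "f"), "a"));
    }
}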

Aggregations

CSVConfig (com.bakdata.conquery.models.config.CSVConfig): 4
FilterTemplate (com.bakdata.conquery.apiv1.FilterTemplate): 3
ConqueryTestSpec (com.bakdata.conquery.integration.json.ConqueryTestSpec): 3
Connector (com.bakdata.conquery.models.datasets.concepts.Connector): 3
AbstractSelectFilter (com.bakdata.conquery.models.datasets.concepts.filters.specific.AbstractSelectFilter): 3
DatasetId (com.bakdata.conquery.models.identifiable.ids.specific.DatasetId): 3
ConceptsProcessor (com.bakdata.conquery.resources.api.ConceptsProcessor): 3
ResolvedConceptsResult (com.bakdata.conquery.resources.api.ConceptsProcessor.ResolvedConceptsResult): 3
Path (java.nio.file.Path): 3
ParsingException (com.bakdata.conquery.models.exceptions.ParsingException): 1
OutputDescription (com.bakdata.conquery.models.preproc.outputs.OutputDescription): 1
DateReader (com.bakdata.conquery.util.DateReader): 1
CountingInputStream (com.google.common.io.CountingInputStream): 1
CsvParser (com.univocity.parsers.csv.CsvParser): 1
Object2IntArrayMap (it.unimi.dsi.fastutil.objects.Object2IntArrayMap): 1
File (java.io.File): 1
FileInputStream (java.io.FileInputStream): 1
FileNotFoundException (java.io.FileNotFoundException): 1
IOException (java.io.IOException): 1
ArrayList (java.util.ArrayList): 1