Example 1 with OutputDescription

use of com.bakdata.conquery.models.preproc.outputs.OutputDescription in project conquery by bakdata.

the class ImportUpdateTest method execute.

@Override
public void execute(String name, TestConquery testConquery) throws Exception {
    final StandaloneSupport conquery = testConquery.getSupport(name);
    MetaStorage storage = conquery.getMetaStorage();
    String testJson = In.resource("/tests/query/UPDATE_IMPORT_TESTS/SIMPLE_TREECONCEPT_Query.json").withUTF8().readAll();
    final Dataset dataset = conquery.getDataset();
    final Namespace namespace = conquery.getNamespace();
    final ImportId importId1 = ImportId.Parser.INSTANCE.parse(dataset.getName(), "table1", "table1");
    final ImportId importId2 = ImportId.Parser.INSTANCE.parse(dataset.getName(), "table2", "table2");
    QueryTest test = (QueryTest) JsonIntegrationTest.readJson(dataset, testJson);
    final List<RequiredTable> tables = test.getContent().getTables();
    assertThat(tables.size()).isEqualTo(2);
    List<File> cqpps;
    // Manually import data, so we can do our own work.
    {
        ValidatorHelper.failOnError(log, conquery.getValidator().validate(test));
        importSecondaryIds(conquery, test.getContent().getSecondaryIds());
        conquery.waitUntilWorkDone();
        LoadingUtil.importTables(conquery, tables);
        conquery.waitUntilWorkDone();
        LoadingUtil.importConcepts(conquery, test.getRawConcepts());
        conquery.waitUntilWorkDone();
        cqpps = LoadingUtil.generateCqpp(conquery, tables);
        conquery.waitUntilWorkDone();
        assertThat(cqpps.size()).isEqualTo(tables.size());
        LoadingUtil.importCqppFiles(conquery, List.of(cqpps.get(0)));
        conquery.waitUntilWorkDone();
    }
    final Query query = IntegrationUtils.parseQuery(conquery, test.getRawQuery());
    // State before update.
    {
        log.info("Checking state before update");
        assertThat(namespace.getStorage().getAllImports()).hasSize(1);
        // Must contain the import.
        assertThat(namespace.getStorage().getAllImports()).filteredOn(imp -> imp.getId().equals(importId1)).isNotEmpty();
        assertThat(namespace.getStorage().getCentralRegistry().getOptional(importId1)).isNotEmpty();
        for (ShardNode node : conquery.getShardNodes()) {
            for (Worker worker : node.getWorkers().getWorkers().values()) {
                if (!worker.getInfo().getDataset().equals(dataset.getId())) {
                    continue;
                }
                final ModificationShieldedWorkerStorage workerStorage = worker.getStorage();
                assertThat(workerStorage.getAllCBlocks()).describedAs("CBlocks for Worker %s", worker.getInfo().getId()).filteredOn(block -> block.getBucket().getId().getDataset().equals(dataset.getId())).isNotEmpty();
                assertThat(workerStorage.getAllBuckets()).filteredOn(bucket -> bucket.getId().getDataset().equals(dataset.getId())).describedAs("Buckets for Worker %s", worker.getInfo().getId()).isNotEmpty();
                // Must contain the import.
                assertThat(workerStorage.getImport(importId1)).isNotNull();
            }
        }
        assertThat(namespace.getNumberOfEntities()).isEqualTo(4);
        // assert that the query can be executed after the import
        IntegrationUtils.assertQueryResult(conquery, query, 2L, ExecutionState.DONE, conquery.getTestUser(), 201);
    }
    // Updating an import that does not exist should fail with a Not-Found WebApplicationException.
    LoadingUtil.updateCqppFile(conquery, cqpps.get(1), Response.Status.Family.CLIENT_ERROR, "Not Found");
    conquery.waitUntilWorkDone();
    // Manually load new data and update the affected import.
    {
        log.info("Manually loading new data for import");
        final RequiredTable importTable = test.getContent().getTables().stream().filter(table -> table.getName().equalsIgnoreCase(importId1.getTable().getTable())).findFirst().orElseThrow();
        final String csvName = importTable.getCsv().getName();
        final String path = importTable.getCsv().getPath();
        // Copy the new content of the importTable into the CSV file used by the preprocessor, to avoid creating multiple files with the same name.
        FileUtils.copyInputStreamToFile(In.resource(path.substring(0, path.lastIndexOf('/')) + "/" + csvName.replace(".csv", ".update.csv")).asStream(), new File(conquery.getTmpDir(), csvName));
        File descriptionFile = new File(conquery.getTmpDir(), importTable.getName() + ConqueryConstants.EXTENSION_DESCRIPTION);
        File newPreprocessedFile = new File(conquery.getTmpDir(), importTable.getName() + ConqueryConstants.EXTENSION_PREPROCESSED);
        // create import descriptor
        {
            TableImportDescriptor desc = new TableImportDescriptor();
            desc.setName(importTable.getName());
            desc.setTable(importTable.getName());
            TableInputDescriptor input = new TableInputDescriptor();
            {
                input.setPrimary(importTable.getPrimaryColumn().createOutput());
                input.setSourceFile(csvName);
                input.setOutput(new OutputDescription[importTable.getColumns().length]);
                for (int i = 0; i < importTable.getColumns().length; i++) {
                    input.getOutput()[i] = importTable.getColumns()[i].createOutput();
                }
            }
            desc.setInputs(new TableInputDescriptor[] { input });
            Jackson.MAPPER.writeValue(descriptionFile, desc);
        }
        // preprocess
        conquery.preprocessTmp(conquery.getTmpDir(), List.of(descriptionFile));
        log.info("updating import");
        // correct update of the import
        LoadingUtil.updateCqppFile(conquery, newPreprocessedFile, Response.Status.Family.SUCCESSFUL, "No Content");
        conquery.waitUntilWorkDone();
    }
    // State after update.
    {
        log.info("Checking state after update");
        assertThat(namespace.getStorage().getAllImports()).hasSize(1);
        // Must contain the import.
        assertThat(namespace.getStorage().getAllImports()).filteredOn(imp -> imp.getId().equals(importId1)).isNotEmpty();
        assertThat(namespace.getStorage().getCentralRegistry().getOptional(importId1)).isNotEmpty();
        for (ShardNode node : conquery.getShardNodes()) {
            for (Worker worker : node.getWorkers().getWorkers().values()) {
                if (!worker.getInfo().getDataset().equals(dataset.getId())) {
                    continue;
                }
                final ModificationShieldedWorkerStorage workerStorage = worker.getStorage();
                assertThat(workerStorage.getAllCBlocks()).describedAs("CBlocks for Worker %s", worker.getInfo().getId()).filteredOn(block -> block.getBucket().getId().getDataset().equals(dataset.getId())).isNotEmpty();
                assertThat(workerStorage.getAllBuckets()).filteredOn(bucket -> bucket.getId().getDataset().equals(dataset.getId())).describedAs("Buckets for Worker %s", worker.getInfo().getId()).isNotEmpty();
                // Must contain the import.
                assertThat(workerStorage.getImport(importId1)).isNotNull();
            }
        }
        assertThat(namespace.getNumberOfEntities()).isEqualTo(9);
        // Issue a query and assert that it has more content.
        IntegrationUtils.assertQueryResult(conquery, query, 4L, ExecutionState.DONE, conquery.getTestUser(), 201);
    }
}
Also used : ExecutionState(com.bakdata.conquery.models.execution.ExecutionState) Assertions.assertThat(org.assertj.core.api.Assertions.assertThat) RequiredTable(com.bakdata.conquery.integration.common.RequiredTable) QueryTest(com.bakdata.conquery.integration.json.QueryTest) Worker(com.bakdata.conquery.models.worker.Worker) ModificationShieldedWorkerStorage(com.bakdata.conquery.io.storage.ModificationShieldedWorkerStorage) In(com.github.powerlibraries.io.In) LoadingUtil.importSecondaryIds(com.bakdata.conquery.integration.common.LoadingUtil.importSecondaryIds) ConqueryConstants(com.bakdata.conquery.ConqueryConstants) IntegrationUtils(com.bakdata.conquery.integration.common.IntegrationUtils) TableImportDescriptor(com.bakdata.conquery.models.preproc.TableImportDescriptor) TestConquery(com.bakdata.conquery.util.support.TestConquery) ShardNode(com.bakdata.conquery.commands.ShardNode) FileUtils(org.apache.commons.io.FileUtils) LoadingUtil(com.bakdata.conquery.integration.common.LoadingUtil) File(java.io.File) ImportId(com.bakdata.conquery.models.identifiable.ids.specific.ImportId) StandaloneSupport(com.bakdata.conquery.util.support.StandaloneSupport) Dataset(com.bakdata.conquery.models.datasets.Dataset) ValidatorHelper(com.bakdata.conquery.models.exceptions.ValidatorHelper) OutputDescription(com.bakdata.conquery.models.preproc.outputs.OutputDescription) List(java.util.List) Slf4j(lombok.extern.slf4j.Slf4j) Response(javax.ws.rs.core.Response) TableInputDescriptor(com.bakdata.conquery.models.preproc.TableInputDescriptor) JsonIntegrationTest(com.bakdata.conquery.integration.json.JsonIntegrationTest) Query(com.bakdata.conquery.apiv1.query.Query) Jackson(com.bakdata.conquery.io.jackson.Jackson) MetaStorage(com.bakdata.conquery.io.storage.MetaStorage) Namespace(com.bakdata.conquery.models.worker.Namespace)
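
A condensed sketch of the flow Example 1 exercises, assuming the same StandaloneSupport and LoadingUtil test helpers shown above (the method name importThenUpdate is purely illustrative): the OutputDescription array inside each generated TableImportDescriptor decides which CSV columns end up in the CQPP, and updateCqppFile then replaces the data of an existing import with a freshly preprocessed file.

static void importThenUpdate(StandaloneSupport conquery, List<RequiredTable> tables, File updatedCqpp) throws Exception {
    // Preprocess every required table into a CQPP file; the descriptors are built from OutputDescriptions.
    List<File> cqpps = LoadingUtil.generateCqpp(conquery, tables);
    // Import only the first CQPP, as the test above does.
    LoadingUtil.importCqppFiles(conquery, List.of(cqpps.get(0)));
    conquery.waitUntilWorkDone();
    // Replace the existing import with freshly preprocessed data; the server answers "No Content".
    LoadingUtil.updateCqppFile(conquery, updatedCqpp, Response.Status.Family.SUCCESSFUL, "No Content");
    conquery.waitUntilWorkDone();
}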

Example 2 with OutputDescription

use of com.bakdata.conquery.models.preproc.outputs.OutputDescription in project conquery by bakdata.

the class Preprocessor method preprocess.

/**
 * Apply the transformations from the descriptor, then write the result out to a CQPP file for import.
 * <p>
 * Reads the CSV file, extracts the primary key per row, applies the remaining transformations to each row, and finally compresses the data with {@link ColumnStore}.
 */
public static void preprocess(PreprocessingJob preprocessingJob, ProgressBar totalProgress, ConqueryConfig config) throws IOException {
    final File preprocessedFile = preprocessingJob.getPreprocessedFile();
    TableImportDescriptor descriptor = preprocessingJob.getDescriptor();
    // Create temp file that will be moved when finished (we ensure the same file system, to avoid unnecessary copying)
    File tmp = new File(preprocessedFile.getParentFile(), preprocessedFile.getName() + ".tmp");
    // Ensures deletion on failure
    tmp.deleteOnExit();
    if (!Files.isWritable(tmp.getParentFile().toPath())) {
        throw new IllegalArgumentException("No write permission in " + LogUtil.printPath(tmp.getParentFile()));
    }
    if (!Files.isWritable(preprocessedFile.toPath().getParent())) {
        throw new IllegalArgumentException("No write permission in " + LogUtil.printPath(preprocessedFile.toPath().getParent()));
    }
    // delete target file if it exists
    if (preprocessedFile.exists()) {
        FileUtils.forceDelete(preprocessedFile);
    }
    log.info("PREPROCESSING START in {}", preprocessingJob);
    int errors = 0;
    final Preprocessed result = new Preprocessed(config, preprocessingJob);
    long lineId = 0;
    // Gather exception classes to get a better overview of what kinds of errors are happening.
    Object2IntMap<Class<? extends Throwable>> exceptions = new Object2IntArrayMap<>();
    exceptions.defaultReturnValue(0);
    for (int inputSource = 0; inputSource < descriptor.getInputs().length; inputSource++) {
        final TableInputDescriptor input = descriptor.getInputs()[inputSource];
        final File sourceFile = resolveSourceFile(input.getSourceFile(), preprocessingJob.getCsvDirectory(), preprocessingJob.getTag());
        final String name = String.format("%s:%s[%d/%s]", descriptor.toString(), descriptor.getTable(), inputSource, sourceFile.getName());
        ConqueryMDC.setLocation(name);
        if (!(sourceFile.exists() && sourceFile.canRead())) {
            throw new FileNotFoundException(sourceFile.getAbsolutePath());
        }
        CsvParser parser = null;
        try (CountingInputStream countingIn = new CountingInputStream(new FileInputStream(sourceFile))) {
            long progress = 0;
            CSVConfig csvSettings = config.getCsv();
            // Create CSV parser according to config, but overriding some behaviour.
            parser = csvSettings.withParseHeaders(true).withSkipHeader(false).createParser();
            parser.beginParsing(FileUtil.isGZipped(sourceFile) ? new GZIPInputStream(countingIn) : countingIn, csvSettings.getEncoding());
            final String[] headers = parser.getContext().parsedHeaders();
            final Object2IntArrayMap<String> headerMap = TableInputDescriptor.buildHeaderMap(headers);
            // Compile filter.
            final GroovyPredicate filter = input.createFilter(headers);
            DateReader dateReader = config.getLocale().getDateReader();
            final OutputDescription.Output primaryOut = input.getPrimary().createForHeaders(headerMap, dateReader);
            final List<OutputDescription.Output> outputs = new ArrayList<>();
            final PPColumn[] columns = result.getColumns();
            // Instantiate Outputs based on descriptors (apply header positions)
            for (OutputDescription op : input.getOutput()) {
                outputs.add(op.createForHeaders(headerMap, dateReader));
            }
            String[] row;
            // Read all CSV lines, apply the Output transformations and add them to the Preprocessed result.
            while ((row = parser.parseNext()) != null) {
                // This is explicitly NOT in a try-catch block: filter scripts must not fail, and we should not recover from faulty scripts.
                if (filter != null && !filter.filterRow(row)) {
                    continue;
                }
                try {
                    int primaryId = (int) Objects.requireNonNull(primaryOut.createOutput(row, result.getPrimaryColumn(), lineId), "primaryId may not be null");
                    final int primary = result.addPrimary(primaryId);
                    final Object[] outRow = applyOutputs(outputs, columns, row, lineId);
                    result.addRow(primary, columns, outRow);
                } catch (OutputDescription.OutputException e) {
                    exceptions.put(e.getCause().getClass(), exceptions.getInt(e.getCause().getClass()) + 1);
                    errors++;
                    if (log.isTraceEnabled() || errors < config.getPreprocessor().getMaximumPrintedErrors()) {
                        log.warn("Failed to parse `{}` from line: {} content: {}", e.getSource(), lineId, row, e.getCause());
                    } else if (errors == config.getPreprocessor().getMaximumPrintedErrors()) {
                        log.warn("More erroneous lines occurred. Only the first " + config.getPreprocessor().getMaximumPrintedErrors() + " were printed.");
                    }
                } catch (Exception e) {
                    exceptions.put(e.getClass(), exceptions.getInt(e.getClass()) + 1);
                    errors++;
                    if (log.isTraceEnabled() || errors < config.getPreprocessor().getMaximumPrintedErrors()) {
                        log.warn("Failed to parse line: {} content: {}", lineId, row, e);
                    } else if (errors == config.getPreprocessor().getMaximumPrintedErrors()) {
                        log.warn("More erroneous lines occurred. Only the first " + config.getPreprocessor().getMaximumPrintedErrors() + " were printed.");
                    }
                } finally {
                    // report progress
                    totalProgress.addCurrentValue(countingIn.getCount() - progress);
                    progress = countingIn.getCount();
                    lineId++;
                }
            }
        } finally {
            if (parser != null) {
                parser.stopParsing();
            }
        }
    }
    if (errors > 0) {
        log.warn("File `{}` contained {} faulty lines of ~{} total.", preprocessingJob, errors, lineId);
    }
    if (log.isWarnEnabled()) {
        exceptions.forEach((clazz, count) -> log.warn("Got {} `{}`", count, clazz.getSimpleName()));
    }
    result.write(tmp);
    if (errors > 0) {
        log.warn("Had {}% faulty lines ({} of ~{} lines)", String.format("%.2f", 100d * (double) errors / (double) lineId), errors, lineId);
    }
    if ((double) errors / (double) lineId > config.getPreprocessor().getFaultyLineThreshold()) {
        throw new RuntimeException("Too many faulty lines.");
    }
    // If successful, move the tmp file to the target location.
    FileUtils.moveFile(tmp, preprocessedFile);
    log.info("PREPROCESSING DONE in {}", preprocessingJob);
}
Also used : FileNotFoundException(java.io.FileNotFoundException) ArrayList(java.util.ArrayList) DateReader(com.bakdata.conquery.util.DateReader) GZIPInputStream(java.util.zip.GZIPInputStream) OutputDescription(com.bakdata.conquery.models.preproc.outputs.OutputDescription) CsvParser(com.univocity.parsers.csv.CsvParser) CountingInputStream(com.google.common.io.CountingInputStream) FileInputStream(java.io.FileInputStream) CSVConfig(com.bakdata.conquery.models.config.CSVConfig) ParsingException(com.bakdata.conquery.models.exceptions.ParsingException) IOException(java.io.IOException) Object2IntArrayMap(it.unimi.dsi.fastutil.objects.Object2IntArrayMap) UtilityClass(lombok.experimental.UtilityClass) File(java.io.File)
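
The core OutputDescription step of this method can be isolated as below; a minimal sketch assuming the headerMap and dateReader come from the surrounding preprocessing context exactly as shown above (resolveOutputs is a hypothetical helper name, not part of conquery's API).

static List<OutputDescription.Output> resolveOutputs(TableInputDescriptor input, Object2IntArrayMap<String> headerMap, DateReader dateReader) {
    final List<OutputDescription.Output> outputs = new ArrayList<>();
    for (OutputDescription op : input.getOutput()) {
        // Bind each descriptor to concrete header positions and date-parsing rules once,
        // so it can be evaluated cheaply for every CSV row afterwards.
        outputs.add(op.createForHeaders(headerMap, dateReader));
    }
    return outputs;
}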

Example 3 with OutputDescription

use of com.bakdata.conquery.models.preproc.outputs.OutputDescription in project conquery by bakdata.

the class TableInputDescriptor method hashCode.

/**
 * The hash code is used in the validity-hash of preprocessed files.
 */
public int hashCode() {
    final HashCodeBuilder builder = new HashCodeBuilder();
    builder.append(getSourceFile());
    builder.append(getFilter());
    builder.append(getPrimary());
    for (OutputDescription outputDescription : getOutput()) {
        builder.append(outputDescription.hashCode());
    }
    return builder.toHashCode();
}
Also used : OutputDescription(com.bakdata.conquery.models.preproc.outputs.OutputDescription) HashCodeBuilder(org.apache.commons.lang3.builder.HashCodeBuilder)
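
Since the hash covers the OutputDescription array, changing the configured outputs changes the hash and thereby invalidates previously preprocessed files. A minimal sketch of how such a check could look (descriptorUnchanged and recordedHash are illustrative names, not part of conquery's API):

// Compare the hash recorded when the CQPP was written against the current input descriptor.
static boolean descriptorUnchanged(TableInputDescriptor input, int recordedHash) {
    return input.hashCode() == recordedHash;
}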

Example 4 with OutputDescription

use of com.bakdata.conquery.models.preproc.outputs.OutputDescription in project conquery by bakdata.

the class ImportDeletionTest method execute.

@Override
public void execute(String name, TestConquery testConquery) throws Exception {
    final StandaloneSupport conquery = testConquery.getSupport(name);
    MetaStorage storage = conquery.getMetaStorage();
    final String testJson = In.resource("/tests/query/DELETE_IMPORT_TESTS/SIMPLE_TREECONCEPT_Query.test.json").withUTF8().readAll();
    final Dataset dataset = conquery.getDataset();
    final Namespace namespace = conquery.getNamespace();
    final ImportId importId = ImportId.Parser.INSTANCE.parse(dataset.getName(), "test_table2", "test_table2");
    final QueryTest test = (QueryTest) JsonIntegrationTest.readJson(dataset, testJson);
    // Manually import data, so we can do our own work.
    {
        ValidatorHelper.failOnError(log, conquery.getValidator().validate(test));
        importSecondaryIds(conquery, test.getContent().getSecondaryIds());
        conquery.waitUntilWorkDone();
        LoadingUtil.importTables(conquery, test.getContent().getTables());
        conquery.waitUntilWorkDone();
        LoadingUtil.importConcepts(conquery, test.getRawConcepts());
        conquery.waitUntilWorkDone();
        LoadingUtil.importTableContents(conquery, test.getContent().getTables());
        conquery.waitUntilWorkDone();
    }
    final Query query = IntegrationUtils.parseQuery(conquery, test.getRawQuery());
    final int nImports = namespace.getStorage().getAllImports().size();
    // State before deletion.
    {
        log.info("Checking state before deletion");
        // Must contain the import.
        assertThat(namespace.getStorage().getAllImports()).filteredOn(imp -> imp.getId().equals(importId)).isNotEmpty();
        assertThat(namespace.getStorage().getCentralRegistry().getOptional(importId)).isNotEmpty();
        for (ShardNode node : conquery.getShardNodes()) {
            for (Worker worker : node.getWorkers().getWorkers().values()) {
                if (!worker.getInfo().getDataset().equals(dataset.getId())) {
                    continue;
                }
                final ModificationShieldedWorkerStorage workerStorage = worker.getStorage();
                assertThat(workerStorage.getAllCBlocks()).describedAs("CBlocks for Worker %s", worker.getInfo().getId()).filteredOn(block -> block.getBucket().getId().getDataset().equals(dataset.getId())).isNotEmpty();
                assertThat(workerStorage.getAllBuckets()).filteredOn(bucket -> bucket.getId().getDataset().equals(dataset.getId())).describedAs("Buckets for Worker %s", worker.getInfo().getId()).isNotEmpty();
                // Must contain the import.
                assertThat(workerStorage.getImport(importId)).isNotNull();
            }
        }
        log.info("Executing query before deletion");
        IntegrationUtils.assertQueryResult(conquery, query, 2L, ExecutionState.DONE, conquery.getTestUser(), 201);
    }
    // Delete the import.
    {
        log.info("Issuing deletion of import {}", importId);
        final URI deleteImportUri = HierarchyHelper.hierarchicalPath(conquery.defaultAdminURIBuilder(), AdminTablesResource.class, "deleteImport").buildFromMap(Map.of(ResourceConstants.DATASET, conquery.getDataset().getId(), ResourceConstants.TABLE, importId.getTable(), ResourceConstants.IMPORT_ID, importId));
        final Response delete = conquery.getClient().target(deleteImportUri).request(MediaType.APPLICATION_JSON).delete();
        assertThat(delete.getStatusInfo().getFamily()).isEqualTo(Response.Status.Family.SUCCESSFUL);
        conquery.waitUntilWorkDone();
    }
    // State after deletion.
    {
        log.info("Checking state after deletion");
        // We have deleted an import, so now there should be one less.
        assertThat(namespace.getStorage().getAllImports().size()).isEqualTo(nImports - 1);
        // The deleted import should not be found.
        assertThat(namespace.getStorage().getAllImports()).filteredOn(imp -> imp.getId().equals(importId)).isEmpty();
        for (ShardNode node : conquery.getShardNodes()) {
            for (Worker worker : node.getWorkers().getWorkers().values()) {
                if (!worker.getInfo().getDataset().equals(dataset.getId())) {
                    continue;
                }
                final ModificationShieldedWorkerStorage workerStorage = worker.getStorage();
                // No bucket should be found referencing the import.
                assertThat(workerStorage.getAllBuckets()).describedAs("Buckets for Worker %s", worker.getInfo().getId()).filteredOn(bucket -> bucket.getImp().getId().equals(importId)).isEmpty();
                // No CBlock associated with the import may exist.
                assertThat(workerStorage.getAllCBlocks()).describedAs("CBlocks for Worker %s", worker.getInfo().getId()).filteredOn(cBlock -> cBlock.getBucket().getId().getImp().equals(importId)).isEmpty();
                // The import should not exist anymore.
                assertThat(workerStorage.getImport(importId)).describedAs("Import for Worker %s", worker.getInfo().getId()).isNull();
            }
        }
        log.info("Executing query after deletion");
        // Issue a query and assert that it has less content.
        IntegrationUtils.assertQueryResult(conquery, query, 1L, ExecutionState.DONE, conquery.getTestUser(), 201);
    }
    conquery.waitUntilWorkDone();
    // Load more data under the same name into the same table, covering only the previously deleted import/table.
    {
        // only import the deleted import/table
        final RequiredTable import2Table = test.getContent().getTables().stream().filter(table -> table.getName().equalsIgnoreCase(importId.getTable().getTable())).findFirst().orElseThrow();
        final ResourceFile csv = import2Table.getCsv();
        final String path = csv.getPath();
        // copy csv to tmp folder
        // Content 2.2 contains an extra entry of a value that hasn't been seen before.
        FileUtils.copyInputStreamToFile(In.resource(path.substring(0, path.lastIndexOf('/')) + "/" + "content2.2.csv").asStream(), new File(conquery.getTmpDir(), csv.getName()));
        File descriptionFile = new File(conquery.getTmpDir(), import2Table.getName() + ConqueryConstants.EXTENSION_DESCRIPTION);
        File preprocessedFile = new File(conquery.getTmpDir(), import2Table.getName() + ConqueryConstants.EXTENSION_PREPROCESSED);
        // create import descriptor
        TableImportDescriptor desc = new TableImportDescriptor();
        desc.setName(import2Table.getName());
        desc.setTable(import2Table.getName());
        TableInputDescriptor input = new TableInputDescriptor();
        {
            input.setPrimary(import2Table.getPrimaryColumn().createOutput());
            input.setSourceFile(import2Table.getCsv().getName());
            input.setOutput(new OutputDescription[import2Table.getColumns().length]);
            for (int i = 0; i < import2Table.getColumns().length; i++) {
                input.getOutput()[i] = import2Table.getColumns()[i].createOutput();
            }
        }
        desc.setInputs(new TableInputDescriptor[] { input });
        Jackson.MAPPER.writeValue(descriptionFile, desc);
        // preprocess
        conquery.preprocessTmp(conquery.getTmpDir(), List.of(descriptionFile));
        // import preprocessedFiles
        conquery.getDatasetsProcessor().addImport(conquery.getNamespace(), new GZIPInputStream(new FileInputStream(preprocessedFile)));
        conquery.waitUntilWorkDone();
    }
    // State after reimport.
    {
        log.info("Checking state after re-import");
        assertThat(namespace.getStorage().getAllImports().size()).isEqualTo(nImports);
        for (ShardNode node : conquery.getShardNodes()) {
            for (Worker worker : node.getWorkers().getWorkers().values()) {
                if (!worker.getInfo().getDataset().equals(dataset.getId())) {
                    continue;
                }
                final ModificationShieldedWorkerStorage workerStorage = worker.getStorage();
                assertThat(workerStorage.getAllBuckets()).describedAs("Buckets for Worker %s", worker.getInfo().getId()).filteredOn(bucket -> bucket.getImp().getId().equals(importId)).filteredOn(bucket -> bucket.getId().getDataset().equals(dataset.getId())).isNotEmpty();
            }
        }
        log.info("Executing query after re-import");
        // Issue a query and assert that it has the same content as the first time around.
        IntegrationUtils.assertQueryResult(conquery, query, 2L, ExecutionState.DONE, conquery.getTestUser(), 201);
    }
    // Finally, restart conquery and assert again that the data is correct.
    {
        testConquery.shutdown();
        // restart
        testConquery.beforeAll();
        StandaloneSupport conquery2 = testConquery.openDataset(dataset.getId());
        log.info("Checking state after re-start");
        {
            assertThat(namespace.getStorage().getAllImports().size()).isEqualTo(2);
            for (ShardNode node : conquery2.getShardNodes()) {
                for (Worker worker : node.getWorkers().getWorkers().values()) {
                    if (!worker.getInfo().getDataset().equals(dataset.getId()))
                        continue;
                    final ModificationShieldedWorkerStorage workerStorage = worker.getStorage();
                    assertThat(workerStorage.getAllBuckets()).describedAs("Buckets for Worker %s", worker.getInfo().getId()).filteredOn(bucket -> bucket.getId().getDataset().equals(dataset.getId())).filteredOn(bucket -> bucket.getImp().getId().equals(importId)).isNotEmpty();
                }
            }
            log.info("Executing query after re-import");
            // Issue a query and assert that it has the same content as the first time around.
            IntegrationUtils.assertQueryResult(conquery2, query, 2L, ExecutionState.DONE, conquery.getTestUser(), 201);
        }
    }
}
Also used : GZIPInputStream(java.util.zip.GZIPInputStream) ExecutionState(com.bakdata.conquery.models.execution.ExecutionState) Assertions.assertThat(org.assertj.core.api.Assertions.assertThat) AdminTablesResource(com.bakdata.conquery.resources.admin.rest.AdminTablesResource) RequiredTable(com.bakdata.conquery.integration.common.RequiredTable) QueryTest(com.bakdata.conquery.integration.json.QueryTest) Worker(com.bakdata.conquery.models.worker.Worker) MediaType(javax.ws.rs.core.MediaType) ModificationShieldedWorkerStorage(com.bakdata.conquery.io.storage.ModificationShieldedWorkerStorage) In(com.github.powerlibraries.io.In) Map(java.util.Map) LoadingUtil.importSecondaryIds(com.bakdata.conquery.integration.common.LoadingUtil.importSecondaryIds) URI(java.net.URI) ConqueryConstants(com.bakdata.conquery.ConqueryConstants) ResourceFile(com.bakdata.conquery.integration.common.ResourceFile) IntegrationUtils(com.bakdata.conquery.integration.common.IntegrationUtils) ProgrammaticIntegrationTest(com.bakdata.conquery.integration.tests.ProgrammaticIntegrationTest) TableImportDescriptor(com.bakdata.conquery.models.preproc.TableImportDescriptor) TestConquery(com.bakdata.conquery.util.support.TestConquery) ShardNode(com.bakdata.conquery.commands.ShardNode) ResourceConstants(com.bakdata.conquery.resources.ResourceConstants) FileUtils(org.apache.commons.io.FileUtils) FileInputStream(java.io.FileInputStream) LoadingUtil(com.bakdata.conquery.integration.common.LoadingUtil) File(java.io.File) ImportId(com.bakdata.conquery.models.identifiable.ids.specific.ImportId) StandaloneSupport(com.bakdata.conquery.util.support.StandaloneSupport) Dataset(com.bakdata.conquery.models.datasets.Dataset) ValidatorHelper(com.bakdata.conquery.models.exceptions.ValidatorHelper) OutputDescription(com.bakdata.conquery.models.preproc.outputs.OutputDescription) List(java.util.List) Slf4j(lombok.extern.slf4j.Slf4j) Response(javax.ws.rs.core.Response) TableInputDescriptor(com.bakdata.conquery.models.preproc.TableInputDescriptor) JsonIntegrationTest(com.bakdata.conquery.integration.json.JsonIntegrationTest) Query(com.bakdata.conquery.apiv1.query.Query) Jackson(com.bakdata.conquery.io.jackson.Jackson) MetaStorage(com.bakdata.conquery.io.storage.MetaStorage) HierarchyHelper(com.bakdata.conquery.resources.hierarchies.HierarchyHelper) Namespace(com.bakdata.conquery.models.worker.Namespace)
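
The deletion and re-import steps of Example 4, condensed into a sketch; it assumes the same test context (conquery, dataset, importId, preprocessedFile) as the method above, and deleteAndReimport is only an illustrative name.

static void deleteAndReimport(StandaloneSupport conquery, Dataset dataset, ImportId importId, File preprocessedFile) throws Exception {
    // Delete the import through the admin REST resource.
    final URI deleteImportUri = HierarchyHelper.hierarchicalPath(conquery.defaultAdminURIBuilder(), AdminTablesResource.class, "deleteImport").buildFromMap(Map.of(ResourceConstants.DATASET, dataset.getId(), ResourceConstants.TABLE, importId.getTable(), ResourceConstants.IMPORT_ID, importId));
    final Response delete = conquery.getClient().target(deleteImportUri).request(MediaType.APPLICATION_JSON).delete();
    assertThat(delete.getStatusInfo().getFamily()).isEqualTo(Response.Status.Family.SUCCESSFUL);
    conquery.waitUntilWorkDone();
    // Re-import the previously preprocessed CQPP (stored gzip-compressed on disk).
    conquery.getDatasetsProcessor().addImport(conquery.getNamespace(), new GZIPInputStream(new FileInputStream(preprocessedFile)));
    conquery.waitUntilWorkDone();
}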

Example 5 with OutputDescription

use of com.bakdata.conquery.models.preproc.outputs.OutputDescription in project conquery by bakdata.

the class LoadingUtil method generateCqpp.

public static List<File> generateCqpp(StandaloneSupport support, Collection<RequiredTable> tables) throws Exception {
    List<File> preprocessedFiles = new ArrayList<>();
    List<File> descriptions = new ArrayList<>();
    for (RequiredTable rTable : tables) {
        // copy csv to tmp folder
        String name = rTable.getName();
        FileUtils.copyInputStreamToFile(rTable.getCsv().stream(), new File(support.getTmpDir(), rTable.getCsv().getName()));
        // create import descriptor
        final File descriptionFile = support.getTmpDir().toPath().resolve(name + ConqueryConstants.EXTENSION_DESCRIPTION).toFile();
        final File outFile = support.getTmpDir().toPath().resolve(name + EXTENSION_PREPROCESSED).toFile();
        TableImportDescriptor desc = new TableImportDescriptor();
        desc.setName(name);
        desc.setTable(name);
        TableInputDescriptor input = new TableInputDescriptor();
        {
            input.setPrimary(rTable.getPrimaryColumn().createOutput());
            input.setSourceFile(rTable.getCsv().getName());
            input.setOutput(new OutputDescription[rTable.getColumns().length]);
            for (int i = 0; i < rTable.getColumns().length; i++) {
                input.getOutput()[i] = rTable.getColumns()[i].createOutput();
            }
        }
        desc.setInputs(new TableInputDescriptor[] { input });
        Jackson.MAPPER.writeValue(descriptionFile, desc);
        descriptions.add(descriptionFile);
        preprocessedFiles.add(outFile);
    }
    // preprocess
    support.preprocessTmp(support.getTmpDir(), descriptions);
    // clear the MDC location from the preprocessor
    ConqueryMDC.clearLocation();
    return preprocessedFiles;
}
Also used : OutputDescription(com.bakdata.conquery.models.preproc.outputs.OutputDescription) ArrayList(java.util.ArrayList) TableInputDescriptor(com.bakdata.conquery.models.preproc.TableInputDescriptor) File(java.io.File) TableImportDescriptor(com.bakdata.conquery.models.preproc.TableImportDescriptor)
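
Possible usage, mirroring ImportUpdateTest in Example 1 (the conquery support and test objects come from the surrounding test context): generate CQPPs for all required tables and import them in one go.

List<File> cqpps = LoadingUtil.generateCqpp(conquery, test.getContent().getTables());
LoadingUtil.importCqppFiles(conquery, cqpps);
conquery.waitUntilWorkDone();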

Aggregations

OutputDescription (com.bakdata.conquery.models.preproc.outputs.OutputDescription) 5
File (java.io.File) 4
TableImportDescriptor (com.bakdata.conquery.models.preproc.TableImportDescriptor) 3
TableInputDescriptor (com.bakdata.conquery.models.preproc.TableInputDescriptor) 3
ConqueryConstants (com.bakdata.conquery.ConqueryConstants) 2
Query (com.bakdata.conquery.apiv1.query.Query) 2
ShardNode (com.bakdata.conquery.commands.ShardNode) 2
IntegrationUtils (com.bakdata.conquery.integration.common.IntegrationUtils) 2
LoadingUtil (com.bakdata.conquery.integration.common.LoadingUtil) 2
LoadingUtil.importSecondaryIds (com.bakdata.conquery.integration.common.LoadingUtil.importSecondaryIds) 2
RequiredTable (com.bakdata.conquery.integration.common.RequiredTable) 2
JsonIntegrationTest (com.bakdata.conquery.integration.json.JsonIntegrationTest) 2
QueryTest (com.bakdata.conquery.integration.json.QueryTest) 2
Jackson (com.bakdata.conquery.io.jackson.Jackson) 2
MetaStorage (com.bakdata.conquery.io.storage.MetaStorage) 2
ModificationShieldedWorkerStorage (com.bakdata.conquery.io.storage.ModificationShieldedWorkerStorage) 2
Dataset (com.bakdata.conquery.models.datasets.Dataset) 2
ValidatorHelper (com.bakdata.conquery.models.exceptions.ValidatorHelper) 2
ExecutionState (com.bakdata.conquery.models.execution.ExecutionState) 2
ImportId (com.bakdata.conquery.models.identifiable.ids.specific.ImportId) 2