use of com.bakdata.conquery.models.preproc.outputs.OutputDescription in project conquery by bakdata.
the class ImportUpdateTest method execute.
@Override
public void execute(String name, TestConquery testConquery) throws Exception {
final StandaloneSupport conquery = testConquery.getSupport(name);
MetaStorage storage = conquery.getMetaStorage();
String testJson = In.resource("/tests/query/UPDATE_IMPORT_TESTS/SIMPLE_TREECONCEPT_Query.json").withUTF8().readAll();
final Dataset dataset = conquery.getDataset();
final Namespace namespace = conquery.getNamespace();
final ImportId importId1 = ImportId.Parser.INSTANCE.parse(dataset.getName(), "table1", "table1");
final ImportId importId2 = ImportId.Parser.INSTANCE.parse(dataset.getName(), "table2", "table2");
QueryTest test = (QueryTest) JsonIntegrationTest.readJson(dataset, testJson);
final List<RequiredTable> tables = test.getContent().getTables();
assertThat(tables.size()).isEqualTo(2);
List<File> cqpps;
// Manually import data, so we can do our own work.
{
ValidatorHelper.failOnError(log, conquery.getValidator().validate(test));
importSecondaryIds(conquery, test.getContent().getSecondaryIds());
conquery.waitUntilWorkDone();
LoadingUtil.importTables(conquery, tables);
conquery.waitUntilWorkDone();
LoadingUtil.importConcepts(conquery, test.getRawConcepts());
conquery.waitUntilWorkDone();
cqpps = LoadingUtil.generateCqpp(conquery, tables);
conquery.waitUntilWorkDone();
assertThat(cqpps.size()).isEqualTo(tables.size());
LoadingUtil.importCqppFiles(conquery, List.of(cqpps.get(0)));
conquery.waitUntilWorkDone();
}
final Query query = IntegrationUtils.parseQuery(conquery, test.getRawQuery());
// State before update.
{
log.info("Checking state before update");
assertThat(namespace.getStorage().getAllImports()).hasSize(1);
// Must contain the import.
assertThat(namespace.getStorage().getAllImports()).filteredOn(imp -> imp.getId().equals(importId1)).isNotEmpty();
assertThat(namespace.getStorage().getCentralRegistry().getOptional(importId1)).isNotEmpty();
for (ShardNode node : conquery.getShardNodes()) {
for (Worker worker : node.getWorkers().getWorkers().values()) {
if (!worker.getInfo().getDataset().equals(dataset.getId())) {
continue;
}
final ModificationShieldedWorkerStorage workerStorage = worker.getStorage();
assertThat(workerStorage.getAllCBlocks()).describedAs("CBlocks for Worker %s", worker.getInfo().getId()).filteredOn(block -> block.getBucket().getId().getDataset().equals(dataset.getId())).isNotEmpty();
assertThat(workerStorage.getAllBuckets()).filteredOn(bucket -> bucket.getId().getDataset().equals(dataset.getId())).describedAs("Buckets for Worker %s", worker.getInfo().getId()).isNotEmpty();
// Must contain the import.
assertThat(workerStorage.getImport(importId1)).isNotNull();
}
}
assertThat(namespace.getNumberOfEntities()).isEqualTo(4);
// assert that the query can be executed after the import
IntegrationUtils.assertQueryResult(conquery, query, 2L, ExecutionState.DONE, conquery.getTestUser(), 201);
}
// Trying to update an import that does not exist should throw a Not-Found WebApplicationException
LoadingUtil.updateCqppFile(conquery, cqpps.get(1), Response.Status.Family.CLIENT_ERROR, "Not Found");
conquery.waitUntilWorkDone();
// Manually load new data and update the concerned import
{
log.info("Manually loading new data for import");
final RequiredTable importTable = test.getContent().getTables().stream().filter(table -> table.getName().equalsIgnoreCase(importId1.getTable().getTable())).findFirst().orElseThrow();
final String csvName = importTable.getCsv().getName();
final String path = importTable.getCsv().getPath();
// Copy the new content for the importTable into the CSV file used by the preprocessor, to avoid creating multiple files with the same name
FileUtils.copyInputStreamToFile(In.resource(path.substring(0, path.lastIndexOf('/')) + "/" + csvName.replace(".csv", ".update.csv")).asStream(), new File(conquery.getTmpDir(), csvName));
File descriptionFile = new File(conquery.getTmpDir(), importTable.getName() + ConqueryConstants.EXTENSION_DESCRIPTION);
File newPreprocessedFile = new File(conquery.getTmpDir(), importTable.getName() + ConqueryConstants.EXTENSION_PREPROCESSED);
// create import descriptor
{
TableImportDescriptor desc = new TableImportDescriptor();
desc.setName(importTable.getName());
desc.setTable(importTable.getName());
TableInputDescriptor input = new TableInputDescriptor();
{
input.setPrimary(importTable.getPrimaryColumn().createOutput());
input.setSourceFile(csvName);
input.setOutput(new OutputDescription[importTable.getColumns().length]);
for (int i = 0; i < importTable.getColumns().length; i++) {
input.getOutput()[i] = importTable.getColumns()[i].createOutput();
}
}
desc.setInputs(new TableInputDescriptor[] { input });
Jackson.MAPPER.writeValue(descriptionFile, desc);
}
// preprocess
conquery.preprocessTmp(conquery.getTmpDir(), List.of(descriptionFile));
log.info("updating import");
// correct update of the import
LoadingUtil.updateCqppFile(conquery, newPreprocessedFile, Response.Status.Family.SUCCESSFUL, "No Content");
conquery.waitUntilWorkDone();
}
// State after update.
{
log.info("Checking state after update");
assertThat(namespace.getStorage().getAllImports()).hasSize(1);
// Must contain the import.
assertThat(namespace.getStorage().getAllImports()).filteredOn(imp -> imp.getId().equals(importId1)).isNotEmpty();
assertThat(namespace.getStorage().getCentralRegistry().getOptional(importId1)).isNotEmpty();
for (ShardNode node : conquery.getShardNodes()) {
for (Worker worker : node.getWorkers().getWorkers().values()) {
if (!worker.getInfo().getDataset().equals(dataset.getId())) {
continue;
}
final ModificationShieldedWorkerStorage workerStorage = worker.getStorage();
assertThat(workerStorage.getAllCBlocks()).describedAs("CBlocks for Worker %s", worker.getInfo().getId()).filteredOn(block -> block.getBucket().getId().getDataset().equals(dataset.getId())).isNotEmpty();
assertThat(workerStorage.getAllBuckets()).filteredOn(bucket -> bucket.getId().getDataset().equals(dataset.getId())).describedAs("Buckets for Worker %s", worker.getInfo().getId()).isNotEmpty();
// Must contain the import.
assertThat(workerStorage.getImport(importId1)).isNotNull();
}
}
assertThat(namespace.getNumberOfEntities()).isEqualTo(9);
// Issue a query and assert that it has more content.
IntegrationUtils.assertQueryResult(conquery, query, 4L, ExecutionState.DONE, conquery.getTestUser(), 201);
}
}
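The update test above only asserts the HTTP status family and reason phrase that LoadingUtil.updateCqppFile is given ("Not Found" for the missing import, "No Content" for the successful update). A minimal standalone sketch of those expectations using plain JAX-RS constants (the class name ExpectedStatusSketch is made up for illustration):
import javax.ws.rs.core.Response;

public class ExpectedStatusSketch {
	public static void main(String[] args) {
		// 404 is expected when updating an import that was never loaded ...
		Response.Status notFound = Response.Status.NOT_FOUND;
		// ... and 204 when updating an existing import succeeds.
		Response.Status noContent = Response.Status.NO_CONTENT;

		System.out.println(notFound.getFamily() + " / " + notFound.getReasonPhrase());   // CLIENT_ERROR / Not Found
		System.out.println(noContent.getFamily() + " / " + noContent.getReasonPhrase()); // SUCCESSFUL / No Content
	}
}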
use of com.bakdata.conquery.models.preproc.outputs.OutputDescription in project conquery by bakdata.
the class Preprocessor method preprocess.
/**
* Apply the transformations in the descriptor, then write the result out to a CQPP file for import.
* <p>
* Reads the CSV file, extracts the primary key per row, applies the remaining transformations to each row, then compresses the data with {@link ColumnStore}.
*/
public static void preprocess(PreprocessingJob preprocessingJob, ProgressBar totalProgress, ConqueryConfig config) throws IOException {
final File preprocessedFile = preprocessingJob.getPreprocessedFile();
TableImportDescriptor descriptor = preprocessingJob.getDescriptor();
// Create temp file that will be moved when finished (we ensure the same file system, to avoid unnecessary copying)
File tmp = new File(preprocessedFile.getParentFile(), preprocessedFile.getName() + ".tmp");
// Ensures deletion on failure
tmp.deleteOnExit();
if (!Files.isWritable(tmp.getParentFile().toPath())) {
throw new IllegalArgumentException("No write permission in " + LogUtil.printPath(tmp.getParentFile()));
}
if (!Files.isWritable(preprocessedFile.toPath().getParent())) {
throw new IllegalArgumentException("No write permission in " + LogUtil.printPath(preprocessedFile.toPath().getParent()));
}
// delete target file if it exists
if (preprocessedFile.exists()) {
FileUtils.forceDelete(preprocessedFile);
}
log.info("PREPROCESSING START in {}", preprocessingJob);
int errors = 0;
final Preprocessed result = new Preprocessed(config, preprocessingJob);
long lineId = 0;
// Gather exception classes to get better overview of what kind of errors are happening.
Object2IntMap<Class<? extends Throwable>> exceptions = new Object2IntArrayMap<>();
exceptions.defaultReturnValue(0);
for (int inputSource = 0; inputSource < descriptor.getInputs().length; inputSource++) {
final TableInputDescriptor input = descriptor.getInputs()[inputSource];
final File sourceFile = resolveSourceFile(input.getSourceFile(), preprocessingJob.getCsvDirectory(), preprocessingJob.getTag());
final String name = String.format("%s:%s[%d/%s]", descriptor.toString(), descriptor.getTable(), inputSource, sourceFile.getName());
ConqueryMDC.setLocation(name);
if (!(sourceFile.exists() && sourceFile.canRead())) {
throw new FileNotFoundException(sourceFile.getAbsolutePath());
}
CsvParser parser = null;
try (CountingInputStream countingIn = new CountingInputStream(new FileInputStream(sourceFile))) {
long progress = 0;
CSVConfig csvSettings = config.getCsv();
// Create CSV parser according to config, but overriding some behaviour.
parser = csvSettings.withParseHeaders(true).withSkipHeader(false).createParser();
parser.beginParsing(FileUtil.isGZipped(sourceFile) ? new GZIPInputStream(countingIn) : countingIn, csvSettings.getEncoding());
final String[] headers = parser.getContext().parsedHeaders();
final Object2IntArrayMap<String> headerMap = TableInputDescriptor.buildHeaderMap(headers);
// Compile filter.
final GroovyPredicate filter = input.createFilter(headers);
DateReader dateReader = config.getLocale().getDateReader();
final OutputDescription.Output primaryOut = input.getPrimary().createForHeaders(headerMap, dateReader);
final List<OutputDescription.Output> outputs = new ArrayList<>();
final PPColumn[] columns = result.getColumns();
// Instantiate Outputs based on descriptors (apply header positions)
for (OutputDescription op : input.getOutput()) {
outputs.add(op.createForHeaders(headerMap, dateReader));
}
String[] row;
// Read all CSV lines, apply the Output transformations and add them to the Preprocessed result.
while ((row = parser.parseNext()) != null) {
// This is explicitly NOT inside the try-catch block: filter scripts must not fail, and we do not want to recover from faulty scripts.
if (filter != null && !filter.filterRow(row)) {
continue;
}
try {
int primaryId = (int) Objects.requireNonNull(primaryOut.createOutput(row, result.getPrimaryColumn(), lineId), "primaryId may not be null");
final int primary = result.addPrimary(primaryId);
final Object[] outRow = applyOutputs(outputs, columns, row, lineId);
result.addRow(primary, columns, outRow);
} catch (OutputDescription.OutputException e) {
exceptions.put(e.getCause().getClass(), exceptions.getInt(e.getCause().getClass()) + 1);
errors++;
if (log.isTraceEnabled() || errors < config.getPreprocessor().getMaximumPrintedErrors()) {
log.warn("Failed to parse `{}` from line: {} content: {}", e.getSource(), lineId, row, e.getCause());
} else if (errors == config.getPreprocessor().getMaximumPrintedErrors()) {
log.warn("More erroneous lines occurred. Only the first " + config.getPreprocessor().getMaximumPrintedErrors() + " were printed.");
}
} catch (Exception e) {
exceptions.put(e.getClass(), exceptions.getInt(e.getClass()) + 1);
errors++;
if (log.isTraceEnabled() || errors < config.getPreprocessor().getMaximumPrintedErrors()) {
log.warn("Failed to parse line: {} content: {}", lineId, row, e);
} else if (errors == config.getPreprocessor().getMaximumPrintedErrors()) {
log.warn("More erroneous lines occurred. Only the first " + config.getPreprocessor().getMaximumPrintedErrors() + " were printed.");
}
} finally {
// report progress
totalProgress.addCurrentValue(countingIn.getCount() - progress);
progress = countingIn.getCount();
lineId++;
}
}
} finally {
if (parser != null) {
parser.stopParsing();
}
}
}
if (errors > 0) {
log.warn("File `{}` contained {} faulty lines of ~{} total.", preprocessingJob, errors, lineId);
}
if (log.isWarnEnabled()) {
exceptions.forEach((clazz, count) -> log.warn("Got {} `{}`", count, clazz.getSimpleName()));
}
result.write(tmp);
if (errors > 0) {
log.warn("Had {}% faulty lines ({} of ~{} lines)", String.format("%.2f", 100d * (double) errors / (double) lineId), errors, lineId);
}
if ((double) errors / (double) lineId > config.getPreprocessor().getFaultyLineThreshold()) {
throw new RuntimeException("Too many faulty lines.");
}
// If successful, move the tmp file to the target location
FileUtils.moveFile(tmp, preprocessedFile);
log.info("PREPROCESSING DONE in {}", preprocessingJob);
}
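To make the error handling at the end of preprocess concrete: the ratio of faulty to total lines is compared against the configured threshold, and the run aborts before the tmp file is moved if that threshold is exceeded. A minimal sketch with hypothetical numbers (the threshold value is an assumption, not a conquery default):
public class FaultyLineThresholdSketch {
	public static void main(String[] args) {
		// Hypothetical counts, mirroring the check in preprocess above.
		long lineId = 10_000;              // lines read from the CSV
		int errors = 150;                  // lines that failed to parse
		double faultyLineThreshold = 0.01; // assumed value of config.getPreprocessor().getFaultyLineThreshold()

		double ratio = (double) errors / (double) lineId; // 0.015, i.e. 1.5% faulty lines
		if (ratio > faultyLineThreshold) {
			// 1.5% > 1%: the tmp file is never moved and the run fails.
			throw new RuntimeException("Too many faulty lines.");
		}
	}
}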
use of com.bakdata.conquery.models.preproc.outputs.OutputDescription in project conquery by bakdata.
the class TableInputDescriptor method hashCode.
/**
* The hash code is used in the validity-hash of preprocessed files.
*/
public int hashCode() {
final HashCodeBuilder builder = new HashCodeBuilder();
builder.append(getSourceFile());
builder.append(getFilter());
builder.append(getPrimary());
for (OutputDescription outputDescription : getOutput()) {
builder.append(outputDescription.hashCode());
}
return builder.toHashCode();
}
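Because each OutputDescription hash code is appended in order, reordering or replacing outputs changes the descriptor hash and thereby the validity-hash of the preprocessed file. A standalone sketch of that property with Apache Commons Lang's HashCodeBuilder (the integer values merely stand in for real OutputDescription hash codes):
import org.apache.commons.lang3.builder.HashCodeBuilder;

public class DescriptorHashSketch {
	public static void main(String[] args) {
		int[] outputsA = {11, 22, 33}; // stand-ins for OutputDescription hash codes
		int[] outputsB = {22, 11, 33}; // same outputs, different order

		HashCodeBuilder a = new HashCodeBuilder();
		HashCodeBuilder b = new HashCodeBuilder();
		for (int h : outputsA) {
			a.append(h);
		}
		for (int h : outputsB) {
			b.append(h);
		}

		// Different order -> (almost certainly) a different hash, so the validity-hash detects the change.
		System.out.println(a.toHashCode() + " vs " + b.toHashCode());
	}
}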
use of com.bakdata.conquery.models.preproc.outputs.OutputDescription in project conquery by bakdata.
the class ImportDeletionTest method execute.
@Override
public void execute(String name, TestConquery testConquery) throws Exception {
final StandaloneSupport conquery = testConquery.getSupport(name);
MetaStorage storage = conquery.getMetaStorage();
final String testJson = In.resource("/tests/query/DELETE_IMPORT_TESTS/SIMPLE_TREECONCEPT_Query.test.json").withUTF8().readAll();
final Dataset dataset = conquery.getDataset();
final Namespace namespace = conquery.getNamespace();
final ImportId importId = ImportId.Parser.INSTANCE.parse(dataset.getName(), "test_table2", "test_table2");
final QueryTest test = (QueryTest) JsonIntegrationTest.readJson(dataset, testJson);
// Manually import data, so we can do our own work.
{
ValidatorHelper.failOnError(log, conquery.getValidator().validate(test));
importSecondaryIds(conquery, test.getContent().getSecondaryIds());
conquery.waitUntilWorkDone();
LoadingUtil.importTables(conquery, test.getContent().getTables());
conquery.waitUntilWorkDone();
LoadingUtil.importConcepts(conquery, test.getRawConcepts());
conquery.waitUntilWorkDone();
LoadingUtil.importTableContents(conquery, test.getContent().getTables());
conquery.waitUntilWorkDone();
}
final Query query = IntegrationUtils.parseQuery(conquery, test.getRawQuery());
final int nImports = namespace.getStorage().getAllImports().size();
// State before deletion.
{
log.info("Checking state before deletion");
// Must contain the import.
assertThat(namespace.getStorage().getAllImports()).filteredOn(imp -> imp.getId().equals(importId)).isNotEmpty();
assertThat(namespace.getStorage().getCentralRegistry().getOptional(importId)).isNotEmpty();
for (ShardNode node : conquery.getShardNodes()) {
for (Worker worker : node.getWorkers().getWorkers().values()) {
if (!worker.getInfo().getDataset().equals(dataset.getId())) {
continue;
}
final ModificationShieldedWorkerStorage workerStorage = worker.getStorage();
assertThat(workerStorage.getAllCBlocks()).describedAs("CBlocks for Worker %s", worker.getInfo().getId()).filteredOn(block -> block.getBucket().getId().getDataset().equals(dataset.getId())).isNotEmpty();
assertThat(workerStorage.getAllBuckets()).filteredOn(bucket -> bucket.getId().getDataset().equals(dataset.getId())).describedAs("Buckets for Worker %s", worker.getInfo().getId()).isNotEmpty();
// Must contain the import.
assertThat(workerStorage.getImport(importId)).isNotNull();
}
}
log.info("Executing query before deletion");
IntegrationUtils.assertQueryResult(conquery, query, 2L, ExecutionState.DONE, conquery.getTestUser(), 201);
}
// Delete the import.
{
log.info("Issuing deletion of import {}", importId);
final URI deleteImportUri = HierarchyHelper.hierarchicalPath(conquery.defaultAdminURIBuilder(), AdminTablesResource.class, "deleteImport").buildFromMap(Map.of(ResourceConstants.DATASET, conquery.getDataset().getId(), ResourceConstants.TABLE, importId.getTable(), ResourceConstants.IMPORT_ID, importId));
final Response delete = conquery.getClient().target(deleteImportUri).request(MediaType.APPLICATION_JSON).delete();
assertThat(delete.getStatusInfo().getFamily()).isEqualTo(Response.Status.Family.SUCCESSFUL);
conquery.waitUntilWorkDone();
}
// State after deletion.
{
log.info("Checking state after deletion");
// We have deleted an import, so now there should be one less.
assertThat(namespace.getStorage().getAllImports().size()).isEqualTo(nImports - 1);
// The deleted import should not be found.
assertThat(namespace.getStorage().getAllImports()).filteredOn(imp -> imp.getId().equals(importId)).isEmpty();
for (ShardNode node : conquery.getShardNodes()) {
for (Worker worker : node.getWorkers().getWorkers().values()) {
if (!worker.getInfo().getDataset().equals(dataset.getId())) {
continue;
}
final ModificationShieldedWorkerStorage workerStorage = worker.getStorage();
// No bucket should be found referencing the import.
assertThat(workerStorage.getAllBuckets()).describedAs("Buckets for Worker %s", worker.getInfo().getId()).filteredOn(bucket -> bucket.getImp().getId().equals(importId)).isEmpty();
// No CBlock associated with the import may exist
assertThat(workerStorage.getAllCBlocks()).describedAs("CBlocks for Worker %s", worker.getInfo().getId()).filteredOn(cBlock -> cBlock.getBucket().getId().getImp().equals(importId)).isEmpty();
// The import should not exist anymore
assertThat(workerStorage.getImport(importId)).describedAs("Import for Worker %s", worker.getInfo().getId()).isNull();
}
}
log.info("Executing query after deletion");
// Issue a query and assert that it has less content.
IntegrationUtils.assertQueryResult(conquery, query, 1L, ExecutionState.DONE, conquery.getTestUser(), 201);
}
conquery.waitUntilWorkDone();
// Load more data under the same name into the same table, but only for the previously deleted import/table
{
// only import the deleted import/table
final RequiredTable import2Table = test.getContent().getTables().stream().filter(table -> table.getName().equalsIgnoreCase(importId.getTable().getTable())).findFirst().orElseThrow();
final ResourceFile csv = import2Table.getCsv();
final String path = csv.getPath();
// copy csv to tmp folder
// Content 2.2 contains an extra entry of a value that hasn't been seen before.
FileUtils.copyInputStreamToFile(In.resource(path.substring(0, path.lastIndexOf('/')) + "/" + "content2.2.csv").asStream(), new File(conquery.getTmpDir(), csv.getName()));
File descriptionFile = new File(conquery.getTmpDir(), import2Table.getName() + ConqueryConstants.EXTENSION_DESCRIPTION);
File preprocessedFile = new File(conquery.getTmpDir(), import2Table.getName() + ConqueryConstants.EXTENSION_PREPROCESSED);
// create import descriptor
TableImportDescriptor desc = new TableImportDescriptor();
desc.setName(import2Table.getName());
desc.setTable(import2Table.getName());
TableInputDescriptor input = new TableInputDescriptor();
{
input.setPrimary(import2Table.getPrimaryColumn().createOutput());
input.setSourceFile(import2Table.getCsv().getName());
input.setOutput(new OutputDescription[import2Table.getColumns().length]);
for (int i = 0; i < import2Table.getColumns().length; i++) {
input.getOutput()[i] = import2Table.getColumns()[i].createOutput();
}
}
desc.setInputs(new TableInputDescriptor[] { input });
Jackson.MAPPER.writeValue(descriptionFile, desc);
// preprocess
conquery.preprocessTmp(conquery.getTmpDir(), List.of(descriptionFile));
// import preprocessedFiles
conquery.getDatasetsProcessor().addImport(conquery.getNamespace(), new GZIPInputStream(new FileInputStream(preprocessedFile)));
conquery.waitUntilWorkDone();
}
// State after reimport.
{
log.info("Checking state after re-import");
assertThat(namespace.getStorage().getAllImports().size()).isEqualTo(nImports);
for (ShardNode node : conquery.getShardNodes()) {
for (Worker worker : node.getWorkers().getWorkers().values()) {
if (!worker.getInfo().getDataset().equals(dataset.getId())) {
continue;
}
final ModificationShieldedWorkerStorage workerStorage = worker.getStorage();
assertThat(workerStorage.getAllBuckets()).describedAs("Buckets for Worker %s", worker.getInfo().getId()).filteredOn(bucket -> bucket.getImp().getId().equals(importId)).filteredOn(bucket -> bucket.getId().getDataset().equals(dataset.getId())).isNotEmpty();
}
}
log.info("Executing query after re-import");
// Issue a query and assert that it has the same content as the first time around.
IntegrationUtils.assertQueryResult(conquery, query, 2L, ExecutionState.DONE, conquery.getTestUser(), 201);
}
// Finally, restart conquery and assert again, that the data is correct.
{
testConquery.shutdown();
// restart
testConquery.beforeAll();
StandaloneSupport conquery2 = testConquery.openDataset(dataset.getId());
log.info("Checking state after re-start");
{
assertThat(namespace.getStorage().getAllImports().size()).isEqualTo(2);
for (ShardNode node : conquery2.getShardNodes()) {
for (Worker worker : node.getWorkers().getWorkers().values()) {
if (!worker.getInfo().getDataset().equals(dataset.getId()))
continue;
final ModificationShieldedWorkerStorage workerStorage = worker.getStorage();
assertThat(workerStorage.getAllBuckets()).describedAs("Buckets for Worker %s", worker.getInfo().getId()).filteredOn(bucket -> bucket.getId().getDataset().equals(dataset.getId())).filteredOn(bucket -> bucket.getImp().getId().equals(importId)).isNotEmpty();
}
}
log.info("Executing query after re-import");
// Issue a query and assert that it has the same content as the first time around.
IntegrationUtils.assertQueryResult(conquery2, query, 2L, ExecutionState.DONE, conquery.getTestUser(), 201);
}
}
}
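The deletion step in this test reduces to a plain JAX-RS DELETE against the admin API plus a check of the status family. A minimal helper distilled from the snippet above (the method name deleteImport and the failure handling are illustrative, not part of conquery; the URI would be built with HierarchyHelper.hierarchicalPath as shown):
import java.net.URI;

import javax.ws.rs.client.Client;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;

final class AdminApiSketch {
	// deleteImportUri points at the "deleteImport" admin resource, as constructed in the test above.
	static void deleteImport(Client client, URI deleteImportUri) {
		Response delete = client.target(deleteImportUri)
								.request(MediaType.APPLICATION_JSON)
								.delete();
		// The test only checks the status family, not the exact status code.
		if (delete.getStatusInfo().getFamily() != Response.Status.Family.SUCCESSFUL) {
			throw new IllegalStateException("Deleting the import failed with status " + delete.getStatus());
		}
	}
}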
use of com.bakdata.conquery.models.preproc.outputs.OutputDescription in project conquery by bakdata.
the class LoadingUtil method generateCqpp.
public static List<File> generateCqpp(StandaloneSupport support, Collection<RequiredTable> tables) throws Exception {
List<File> preprocessedFiles = new ArrayList<>();
List<File> descriptions = new ArrayList<>();
for (RequiredTable rTable : tables) {
// copy csv to tmp folder
String name = rTable.getName();
FileUtils.copyInputStreamToFile(rTable.getCsv().stream(), new File(support.getTmpDir(), rTable.getCsv().getName()));
// create import descriptor
final File descriptionFile = support.getTmpDir().toPath().resolve(name + ConqueryConstants.EXTENSION_DESCRIPTION).toFile();
final File outFile = support.getTmpDir().toPath().resolve(name + EXTENSION_PREPROCESSED).toFile();
TableImportDescriptor desc = new TableImportDescriptor();
desc.setName(name);
desc.setTable(name);
TableInputDescriptor input = new TableInputDescriptor();
{
input.setPrimary(rTable.getPrimaryColumn().createOutput());
input.setSourceFile(rTable.getCsv().getName());
input.setOutput(new OutputDescription[rTable.getColumns().length]);
for (int i = 0; i < rTable.getColumns().length; i++) {
input.getOutput()[i] = rTable.getColumns()[i].createOutput();
}
}
desc.setInputs(new TableInputDescriptor[] { input });
Jackson.MAPPER.writeValue(descriptionFile, desc);
descriptions.add(descriptionFile);
preprocessedFiles.add(outFile);
}
// preprocess
support.preprocessTmp(support.getTmpDir(), descriptions);
// clear the MDC location from the preprocessor
ConqueryMDC.clearLocation();
return preprocessedFiles;
}
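A possible way to use this helper in an integration test, distilled from ImportUpdateTest above (it assumes the StandaloneSupport conquery and QueryTest test fixtures from that test):
// Preprocess every required table into a CQPP file ...
List<File> cqpps = LoadingUtil.generateCqpp(conquery, test.getContent().getTables());
conquery.waitUntilWorkDone();
// ... and import them all at once.
LoadingUtil.importCqppFiles(conquery, cqpps);
conquery.waitUntilWorkDone();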