Use of com.bakdata.conquery.models.preproc.TableInputDescriptor in the project conquery by bakdata.
Shown below: the method execute of the class ImportUpdateTest.
/**
 * Integration test for updating the data of an already existing import.
 *
 * <p>Steps performed, in order:
 * <ol>
 *   <li>Import secondary ids, tables and concepts of the test spec, generate one CQPP per table,
 *       but load only the first CQPP.</li>
 *   <li>Check that {@code table1.table1} is the only import and is known to all workers of the dataset.</li>
 *   <li>Try to update with the CQPP of the table that was never imported and expect a client error.</li>
 *   <li>Overwrite the CSV of the imported table with its {@code .update.csv} variant, preprocess it again
 *       and submit it as an update.</li>
 *   <li>Check the import state again and verify the changed entity count and query result.</li>
 * </ol>
 *
 * @param name         name used to obtain the {@link StandaloneSupport} from {@code testConquery}
 * @param testConquery the test instance that provides the support object
 * @throws Exception if any setup step, request or assertion helper fails
 */
@Override
public void execute(String name, TestConquery testConquery) throws Exception {
	final StandaloneSupport conquery = testConquery.getSupport(name);
	final String testJson = In.resource("/tests/query/UPDATE_IMPORT_TESTS/SIMPLE_TREECONCEPT_Query.json").withUTF8().readAll();
	final Dataset dataset = conquery.getDataset();
	final Namespace namespace = conquery.getNamespace();
	final ImportId importId1 = ImportId.Parser.INSTANCE.parse(dataset.getName(), "table1", "table1");
	final QueryTest test = (QueryTest) JsonIntegrationTest.readJson(dataset, testJson);
	final List<RequiredTable> tables = test.getContent().getTables();
	assertThat(tables).hasSize(2);

	final List<File> cqpps;
	// Manually import data, so we can do our own work.
	{
		ValidatorHelper.failOnError(log, conquery.getValidator().validate(test));
		importSecondaryIds(conquery, test.getContent().getSecondaryIds());
		conquery.waitUntilWorkDone();
		LoadingUtil.importTables(conquery, tables);
		conquery.waitUntilWorkDone();
		LoadingUtil.importConcepts(conquery, test.getRawConcepts());
		conquery.waitUntilWorkDone();
		cqpps = LoadingUtil.generateCqpp(conquery, tables);
		conquery.waitUntilWorkDone();
		assertThat(cqpps).hasSize(tables.size());
		// Only the first CQPP is loaded; the second one is used further down for the failing update.
		LoadingUtil.importCqppFiles(conquery, List.of(cqpps.get(0)));
		conquery.waitUntilWorkDone();
	}

	final Query query = IntegrationUtils.parseQuery(conquery, test.getRawQuery());

	// State before update.
	{
		log.info("Checking state before update");
		assertSingleImportOnAllWorkers(conquery, namespace, dataset, importId1);
		assertThat(namespace.getNumberOfEntities()).isEqualTo(4);
		// assert that the query can be executed after the import
		IntegrationUtils.assertQueryResult(conquery, query, 2L, ExecutionState.DONE, conquery.getTestUser(), 201);
	}

	// Try to update an import that does not exist should throw a Not-Found Webapplication Exception
	LoadingUtil.updateCqppFile(conquery, cqpps.get(1), Response.Status.Family.CLIENT_ERROR, "Not Found");
	conquery.waitUntilWorkDone();

	// Load manually new data for import and update the concerned import
	{
		log.info("Manually loading new data for import");
		final RequiredTable importTable = tables.stream()
				.filter(table -> table.getName().equalsIgnoreCase(importId1.getTable().getTable()))
				.findFirst()
				.orElseThrow();
		final String csvName = importTable.getCsv().getName();
		final String path = importTable.getCsv().getPath();
		final String updateResource = path.substring(0, path.lastIndexOf('/')) + "/" + csvName.replace(".csv", ".update.csv");

		// copy new content of the importTable into the csv-File used by the preprocessor to avoid creating multiple files with the same names
		FileUtils.copyInputStreamToFile(In.resource(updateResource).asStream(), new File(conquery.getTmpDir(), csvName));

		final File descriptionFile = new File(conquery.getTmpDir(), importTable.getName() + ConqueryConstants.EXTENSION_DESCRIPTION);
		final File newPreprocessedFile = new File(conquery.getTmpDir(), importTable.getName() + ConqueryConstants.EXTENSION_PREPROCESSED);

		// create import descriptor
		{
			final TableImportDescriptor desc = new TableImportDescriptor();
			desc.setName(importTable.getName());
			desc.setTable(importTable.getName());

			final TableInputDescriptor input = new TableInputDescriptor();
			input.setPrimary(importTable.getPrimaryColumn().createOutput());
			input.setSourceFile(csvName);
			input.setOutput(new OutputDescription[importTable.getColumns().length]);
			for (int i = 0; i < importTable.getColumns().length; i++) {
				input.getOutput()[i] = importTable.getColumns()[i].createOutput();
			}

			desc.setInputs(new TableInputDescriptor[] { input });
			Jackson.MAPPER.writeValue(descriptionFile, desc);
		}

		// preprocess
		conquery.preprocessTmp(conquery.getTmpDir(), List.of(descriptionFile));

		log.info("updating import");
		// correct update of the import
		LoadingUtil.updateCqppFile(conquery, newPreprocessedFile, Response.Status.Family.SUCCESSFUL, "No Content");
		conquery.waitUntilWorkDone();
	}

	// State after update.
	{
		log.info("Checking state after update");
		assertSingleImportOnAllWorkers(conquery, namespace, dataset, importId1);
		assertThat(namespace.getNumberOfEntities()).isEqualTo(9);
		// Issue a query and assert that it has more content.
		IntegrationUtils.assertQueryResult(conquery, query, 4L, ExecutionState.DONE, conquery.getTestUser(), 201);
	}
}

/**
 * Asserts that {@code importId} is the one and only import in the namespace storage, that it is registered
 * in the central registry, and that every worker of {@code dataset} has the import together with at least
 * one bucket and one CBlock of that dataset.
 */
private static void assertSingleImportOnAllWorkers(StandaloneSupport conquery, Namespace namespace, Dataset dataset, ImportId importId) {
	assertThat(namespace.getStorage().getAllImports()).hasSize(1);
	// Must contain the import.
	assertThat(namespace.getStorage().getAllImports())
			.filteredOn(imp -> imp.getId().equals(importId))
			.isNotEmpty();
	assertThat(namespace.getStorage().getCentralRegistry().getOptional(importId)).isNotEmpty();

	for (ShardNode node : conquery.getShardNodes()) {
		for (Worker worker : node.getWorkers().getWorkers().values()) {
			// Workers of other datasets are not affected by this test.
			if (!worker.getInfo().getDataset().equals(dataset.getId())) {
				continue;
			}
			final ModificationShieldedWorkerStorage workerStorage = worker.getStorage();
			assertThat(workerStorage.getAllCBlocks())
					.describedAs("CBlocks for Worker %s", worker.getInfo().getId())
					.filteredOn(block -> block.getBucket().getId().getDataset().equals(dataset.getId()))
					.isNotEmpty();
			assertThat(workerStorage.getAllBuckets())
					.filteredOn(bucket -> bucket.getId().getDataset().equals(dataset.getId()))
					.describedAs("Buckets for Worker %s", worker.getInfo().getId())
					.isNotEmpty();
			// Must contain the import.
			assertThat(workerStorage.getImport(importId)).isNotNull();
		}
	}
}
Use of com.bakdata.conquery.models.preproc.TableInputDescriptor in the project conquery by bakdata.
Shown below: the method run of the class PreprocessorCommand.
/**
 * Runs preprocessing for all description files given on the command line.
 *
 * <p>The method collects one list of {@link PreprocessingJob}s per description file (and per tag, if tags
 * were supplied), checks that every source file of every job exists, removes the jobs for which
 * {@code requiresProcessing} is false, submits the remaining jobs to the thread pool and finally logs a
 * summary of succeeded, missing and failed jobs.
 *
 * @param environment provides the validator passed to {@code findPreprocessingDescriptions}
 * @param namespace   parsed command line arguments ({@code fast-fail}, {@code strict}, {@code tag},
 *                    {@code in}, {@code out}, {@code desc})
 * @param config      configuration; supplies the number of preprocessing threads and is handed to the preprocessor
 * @throws Exception if collecting the jobs fails or the thread is interrupted while waiting for the pool
 */
@Override
protected void run(Environment environment, Namespace namespace, ConqueryConfig config) throws Exception {
	if (pool == null) {
		pool = Executors.newFixedThreadPool(config.getPreprocessor().getNThreads());
	}

	// Both flags are optional on the command line: fast-fail defaults to false, strict defaults to true.
	isFailFast = Optional.ofNullable(namespace.getBoolean("fast-fail")).orElse(false);
	isStrict = Optional.ofNullable(namespace.getBoolean("strict")).orElse(true);

	// Tag if present is appended to input-file csvs, output-file cqpp and used as id of cqpps
	final List<String> tags = namespace.<String>getList("tag");
	final File inDir = namespace.get("in");
	final File outDir = namespace.get("out");
	final List<File> descriptionFiles = namespace.<File>getList("desc");

	log.info("Preprocessing from command line config.");

	final Collection<PreprocessingJob> jobs = new ArrayList<>();
	if (tags == null || tags.isEmpty()) {
		for (File desc : descriptionFiles) {
			final List<PreprocessingJob> descriptions = findPreprocessingDescriptions(desc, inDir, outDir, Optional.empty(), environment.getValidator());
			jobs.addAll(descriptions);
		}
	}
	else {
		for (String tag : tags) {
			for (File desc : descriptionFiles) {
				final List<PreprocessingJob> jobDescriptions = findPreprocessingDescriptions(desc, inDir, outDir, Optional.of(tag), environment.getValidator());
				jobs.addAll(jobDescriptions);
			}
		}
	}

	// One entry is recorded per missing source file, so a job may appear more than once.
	final List<PreprocessingJob> missing = new ArrayList<>();
	for (PreprocessingJob job : jobs) {
		for (TableInputDescriptor input : job.getDescriptor().getInputs()) {
			final File sourceFile = Preprocessor.resolveSourceFile(input.getSourceFile(), job.getCsvDirectory(), job.getTag());
			if (!sourceFile.exists()) {
				log.error("Did not find file `{}` for Preprocessing[{}].", sourceFile, job);
				missing.add(job);
			}
		}
	}

	// This will halt preprocessing immediately.
	if (isStrict && !missing.isEmpty()) {
		log.error("FAILED Preprocessing, files are missing.");
		doFail();
	}

	jobs.removeIf(Predicate.not(PreprocessorCommand::requiresProcessing));

	final long totalSize = jobs.stream().mapToLong(PreprocessingJob::estimateTotalCsvSizeBytes).sum();
	log.info("Required to preprocess {} in total", BinaryByteUnit.format(totalSize));

	final ProgressBar totalProgress = new ProgressBar(totalSize, System.out);

	for (PreprocessingJob job : jobs) {
		pool.submit(() -> {
			ConqueryMDC.setLocation(job.toString());
			try {
				Preprocessor.preprocess(job, totalProgress, config);
				success.add(job.toString());
			}
			catch (FileNotFoundException e) {
				log.warn("Did not find file `{}` for preprocessing.", e.getMessage());
				addMissing(job);
			}
			catch (Exception e) {
				// The trailing throwable is logged with its stack trace; the message itself stays parameterized.
				log.error("Failed to preprocess {}", LogUtil.printPath(job.getDescriptionFile()), e);
				addFailed(job);
			}
		});
	}

	pool.shutdown();
	// awaitTermination returns false on timeout; without this check an unfinished run would be summarized as if it were complete.
	if (!pool.awaitTermination(24, TimeUnit.HOURS)) {
		log.error("Preprocessing did not terminate within 24 hours, the following summary is incomplete.");
	}

	ConqueryMDC.clearLocation();

	if (!success.isEmpty()) {
		log.info("Successfully Preprocess {} Jobs:", success.size());
		success.forEach(desc -> log.info("\tSucceeded Preprocessing for {}", desc));
	}

	if (!missing.isEmpty()) {
		log.warn("Did not find {} Files", missing.size());
		missing.forEach(desc -> log.warn("\tDid not find file for {}", desc));
	}

	if (isFailed()) {
		log.error("Failed {} Preprocessing Jobs:", failed.size());
		failed.forEach(desc -> log.error("\tFailed Preprocessing for {}", desc));
		doFail();
	}
}
Use of com.bakdata.conquery.models.preproc.TableInputDescriptor in the project conquery by bakdata.
Shown below: the method execute of the class ImportDeletionTest.
/**
 * Integration test for deleting an import and importing it again.
 *
 * <p>Steps performed, in order:
 * <ol>
 *   <li>Import secondary ids, tables, concepts and table contents of the test spec.</li>
 *   <li>Check that the import {@code test_table2.test_table2} is present in the namespace and on all workers
 *       of the dataset, and run the query.</li>
 *   <li>Delete the import through the admin REST resource and check that the import, its buckets and its
 *       CBlocks are gone, then run the query again.</li>
 *   <li>Preprocess {@code content2.2.csv} under the name of the deleted table, import it and check the state.</li>
 *   <li>Restart the test instance and repeat the checks against the freshly opened dataset.</li>
 * </ol>
 *
 * @param name         name used to obtain the {@link StandaloneSupport} from {@code testConquery}
 * @param testConquery the test instance; it is shut down and started again at the end of the test
 * @throws Exception if any setup step, request or assertion helper fails
 */
@Override
public void execute(String name, TestConquery testConquery) throws Exception {
	final StandaloneSupport conquery = testConquery.getSupport(name);
	final String testJson = In.resource("/tests/query/DELETE_IMPORT_TESTS/SIMPLE_TREECONCEPT_Query.test.json").withUTF8().readAll();
	final Dataset dataset = conquery.getDataset();
	final Namespace namespace = conquery.getNamespace();
	final ImportId importId = ImportId.Parser.INSTANCE.parse(dataset.getName(), "test_table2", "test_table2");
	final QueryTest test = (QueryTest) JsonIntegrationTest.readJson(dataset, testJson);

	// Manually import data, so we can do our own work.
	{
		ValidatorHelper.failOnError(log, conquery.getValidator().validate(test));
		importSecondaryIds(conquery, test.getContent().getSecondaryIds());
		conquery.waitUntilWorkDone();
		LoadingUtil.importTables(conquery, test.getContent().getTables());
		conquery.waitUntilWorkDone();
		LoadingUtil.importConcepts(conquery, test.getRawConcepts());
		conquery.waitUntilWorkDone();
		LoadingUtil.importTableContents(conquery, test.getContent().getTables());
		conquery.waitUntilWorkDone();
	}

	final Query query = IntegrationUtils.parseQuery(conquery, test.getRawQuery());
	// Number of imports after the initial load; later states are compared against it.
	final int nImports = namespace.getStorage().getAllImports().size();

	// State before deletion.
	{
		log.info("Checking state before deletion");
		// Must contain the import.
		assertThat(namespace.getStorage().getAllImports()).filteredOn(imp -> imp.getId().equals(importId)).isNotEmpty();
		assertThat(namespace.getStorage().getCentralRegistry().getOptional(importId)).isNotEmpty();

		for (ShardNode node : conquery.getShardNodes()) {
			for (Worker worker : node.getWorkers().getWorkers().values()) {
				if (!worker.getInfo().getDataset().equals(dataset.getId())) {
					continue;
				}
				final ModificationShieldedWorkerStorage workerStorage = worker.getStorage();
				assertThat(workerStorage.getAllCBlocks())
						.describedAs("CBlocks for Worker %s", worker.getInfo().getId())
						.filteredOn(block -> block.getBucket().getId().getDataset().equals(dataset.getId()))
						.isNotEmpty();
				assertThat(workerStorage.getAllBuckets())
						.filteredOn(bucket -> bucket.getId().getDataset().equals(dataset.getId()))
						.describedAs("Buckets for Worker %s", worker.getInfo().getId())
						.isNotEmpty();
				// Must contain the import.
				assertThat(workerStorage.getImport(importId)).isNotNull();
			}
		}

		log.info("Executing query before deletion");
		IntegrationUtils.assertQueryResult(conquery, query, 2L, ExecutionState.DONE, conquery.getTestUser(), 201);
	}

	// Delete the import.
	{
		log.info("Issuing deletion of import {}", importId);
		final URI deleteImportUri = HierarchyHelper.hierarchicalPath(conquery.defaultAdminURIBuilder(), AdminTablesResource.class, "deleteImport")
				.buildFromMap(Map.of(
						ResourceConstants.DATASET, conquery.getDataset().getId(),
						ResourceConstants.TABLE, importId.getTable(),
						ResourceConstants.IMPORT_ID, importId
				));
		final Response delete = conquery.getClient().target(deleteImportUri).request(MediaType.APPLICATION_JSON).delete();
		assertThat(delete.getStatusInfo().getFamily()).isEqualTo(Response.Status.Family.SUCCESSFUL);
		conquery.waitUntilWorkDone();
	}

	// State after deletion.
	{
		log.info("Checking state after deletion");
		// We have deleted an import now there should be one less!
		assertThat(namespace.getStorage().getAllImports().size()).isEqualTo(nImports - 1);
		// The deleted import should not be found.
		assertThat(namespace.getStorage().getAllImports()).filteredOn(imp -> imp.getId().equals(importId)).isEmpty();

		for (ShardNode node : conquery.getShardNodes()) {
			for (Worker worker : node.getWorkers().getWorkers().values()) {
				if (!worker.getInfo().getDataset().equals(dataset.getId())) {
					continue;
				}
				final ModificationShieldedWorkerStorage workerStorage = worker.getStorage();
				// No bucket should be found referencing the import.
				assertThat(workerStorage.getAllBuckets())
						.describedAs("Buckets for Worker %s", worker.getInfo().getId())
						.filteredOn(bucket -> bucket.getImp().getId().equals(importId))
						.isEmpty();
				// No CBlock associated with import may exist
				assertThat(workerStorage.getAllCBlocks())
						.describedAs("CBlocks for Worker %s", worker.getInfo().getId())
						.filteredOn(cBlock -> cBlock.getBucket().getId().getImp().equals(importId))
						.isEmpty();
				// Import should not exist anymore
				assertThat(workerStorage.getImport(importId))
						.describedAs("Import for Worker %s", worker.getInfo().getId())
						.isNull();
			}
		}

		log.info("Executing query after deletion");
		// Issue a query and assert that it has less content.
		IntegrationUtils.assertQueryResult(conquery, query, 1L, ExecutionState.DONE, conquery.getTestUser(), 201);
	}

	conquery.waitUntilWorkDone();

	// Load more data under the same name into the same table, with only the deleted import/table
	{
		// only import the deleted import/table
		final RequiredTable import2Table = test.getContent().getTables().stream()
				.filter(table -> table.getName().equalsIgnoreCase(importId.getTable().getTable()))
				.findFirst()
				.orElseThrow();
		final ResourceFile csv = import2Table.getCsv();
		final String path = csv.getPath();

		// copy csv to tmp folder
		// Content 2.2 contains an extra entry of a value that hasn't been seen before.
		FileUtils.copyInputStreamToFile(In.resource(path.substring(0, path.lastIndexOf('/')) + "/" + "content2.2.csv").asStream(), new File(conquery.getTmpDir(), csv.getName()));

		final File descriptionFile = new File(conquery.getTmpDir(), import2Table.getName() + ConqueryConstants.EXTENSION_DESCRIPTION);
		final File preprocessedFile = new File(conquery.getTmpDir(), import2Table.getName() + ConqueryConstants.EXTENSION_PREPROCESSED);

		// create import descriptor
		final TableImportDescriptor desc = new TableImportDescriptor();
		desc.setName(import2Table.getName());
		desc.setTable(import2Table.getName());

		final TableInputDescriptor input = new TableInputDescriptor();
		input.setPrimary(import2Table.getPrimaryColumn().createOutput());
		input.setSourceFile(import2Table.getCsv().getName());
		input.setOutput(new OutputDescription[import2Table.getColumns().length]);
		for (int i = 0; i < import2Table.getColumns().length; i++) {
			input.getOutput()[i] = import2Table.getColumns()[i].createOutput();
		}

		desc.setInputs(new TableInputDescriptor[] { input });
		Jackson.MAPPER.writeValue(descriptionFile, desc);

		// preprocess
		conquery.preprocessTmp(conquery.getTmpDir(), List.of(descriptionFile));

		// import preprocessedFiles
		// NOTE(review): closing the streams here assumes addImport has finished reading them when it returns - confirm.
		try (FileInputStream fileIn = new FileInputStream(preprocessedFile); GZIPInputStream cqppIn = new GZIPInputStream(fileIn)) {
			conquery.getDatasetsProcessor().addImport(conquery.getNamespace(), cqppIn);
		}
		conquery.waitUntilWorkDone();
	}

	// State after reimport.
	{
		log.info("Checking state after re-import");
		assertThat(namespace.getStorage().getAllImports().size()).isEqualTo(nImports);

		for (ShardNode node : conquery.getShardNodes()) {
			for (Worker worker : node.getWorkers().getWorkers().values()) {
				if (!worker.getInfo().getDataset().equals(dataset.getId())) {
					continue;
				}
				final ModificationShieldedWorkerStorage workerStorage = worker.getStorage();
				assertThat(workerStorage.getAllBuckets())
						.describedAs("Buckets for Worker %s", worker.getInfo().getId())
						.filteredOn(bucket -> bucket.getImp().getId().equals(importId))
						.filteredOn(bucket -> bucket.getId().getDataset().equals(dataset.getId()))
						.isNotEmpty();
			}
		}

		log.info("Executing query after re-import");
		// Issue a query and assert that it has the same content as the first time around.
		IntegrationUtils.assertQueryResult(conquery, query, 2L, ExecutionState.DONE, conquery.getTestUser(), 201);
	}

	// Finally, restart conquery and assert again, that the data is correct.
	{
		testConquery.shutdown();
		// restart
		testConquery.beforeAll();
		final StandaloneSupport conquery2 = testConquery.openDataset(dataset.getId());

		log.info("Checking state after re-start");
		{
			// Everything below must go through conquery2: the objects obtained before the shutdown
			// belong to the previous instance and say nothing about the restarted one.
			assertThat(conquery2.getNamespace().getStorage().getAllImports()).hasSize(nImports);

			for (ShardNode node : conquery2.getShardNodes()) {
				for (Worker worker : node.getWorkers().getWorkers().values()) {
					if (!worker.getInfo().getDataset().equals(dataset.getId())) {
						continue;
					}
					final ModificationShieldedWorkerStorage workerStorage = worker.getStorage();
					assertThat(workerStorage.getAllBuckets())
							.describedAs("Buckets for Worker %s", worker.getInfo().getId())
							.filteredOn(bucket -> bucket.getId().getDataset().equals(dataset.getId()))
							.filteredOn(bucket -> bucket.getImp().getId().equals(importId))
							.isNotEmpty();
				}
			}

			log.info("Executing query after re-start");
			// Issue a query and assert that it has the same content as the first time around.
			IntegrationUtils.assertQueryResult(conquery2, query, 2L, ExecutionState.DONE, conquery2.getTestUser(), 201);
		}
	}
}
Use of com.bakdata.conquery.models.preproc.TableInputDescriptor in the project conquery by bakdata.
Shown below: the method generateCqpp of the class LoadingUtil.
/**
 * Writes one import description per table into the tmp directory of {@code support}, copies the table's
 * CSV next to it and then runs the preprocessor over all descriptions at once.
 *
 * @param support supplies the tmp directory and runs the preprocessing
 * @param tables  the tables to create descriptions for
 * @return for each table, in iteration order, the file in the tmp directory named after the table with the
 *         preprocessed-file extension
 * @throws Exception if copying a CSV, writing a description or preprocessing fails
 */
public static List<File> generateCqpp(StandaloneSupport support, Collection<RequiredTable> tables) throws Exception {
	final List<File> cqppFiles = new ArrayList<>();
	final List<File> descriptionFiles = new ArrayList<>();

	for (RequiredTable table : tables) {
		final String tableName = table.getName();

		// Place the CSV content in the tmp directory under the CSV's own file name.
		final File csvTarget = new File(support.getTmpDir(), table.getCsv().getName());
		FileUtils.copyInputStreamToFile(table.getCsv().stream(), csvTarget);

		// Description and output file are both named after the table.
		final File descriptionFile = support.getTmpDir().toPath().resolve(tableName + ConqueryConstants.EXTENSION_DESCRIPTION).toFile();
		final File cqppFile = support.getTmpDir().toPath().resolve(tableName + EXTENSION_PREPROCESSED).toFile();

		final TableImportDescriptor descriptor = new TableImportDescriptor();
		descriptor.setName(tableName);
		descriptor.setTable(tableName);

		// A single input: the primary column plus one output per column of the table.
		final TableInputDescriptor tableInput = new TableInputDescriptor();
		tableInput.setPrimary(table.getPrimaryColumn().createOutput());
		tableInput.setSourceFile(table.getCsv().getName());
		tableInput.setOutput(new OutputDescription[table.getColumns().length]);

		int column = 0;
		while (column < table.getColumns().length) {
			tableInput.getOutput()[column] = table.getColumns()[column].createOutput();
			column++;
		}

		descriptor.setInputs(new TableInputDescriptor[] { tableInput });
		Jackson.MAPPER.writeValue(descriptionFile, descriptor);

		descriptionFiles.add(descriptionFile);
		cqppFiles.add(cqppFile);
	}

	// Preprocess every description in one go.
	support.preprocessTmp(support.getTmpDir(), descriptionFiles);

	// clear the MDC location from the preprocessor
	ConqueryMDC.clearLocation();

	return cqppFiles;
}
Aggregations