use of com.bakdata.conquery.models.config.CSVConfig in project conquery by bakdata.
the class FilterResolutionExactTest method execute.
@Override
public void execute(StandaloneSupport conquery) throws Exception {
// read test specification
String testJson = In.resource("/tests/query/MULTI_SELECT_DATE_RESTRICTION_OR_CONCEPT_QUERY/MULTI_SELECT_DATE_RESTRICTION_OR_CONCEPT_QUERY.test.json").withUTF8().readAll();
DatasetId dataset = conquery.getDataset().getId();
ConqueryTestSpec test = JsonIntegrationTest.readJson(dataset, testJson);
ValidatorHelper.failOnError(log, conquery.getValidator().validate(test));
CSVConfig csvConf = conquery.getConfig().getCsv();
test.importRequiredData(conquery);
FilterSearch.updateSearch(conquery.getNamespace().getNamespaces(), Collections.singleton(conquery.getNamespace().getDataset()), conquery.getDatasetsProcessor().getJobManager(), csvConf);
conquery.waitUntilWorkDone();
Concept<?> concept = conquery.getNamespace().getStorage().getAllConcepts().iterator().next();
Connector connector = concept.getConnectors().iterator().next();
AbstractSelectFilter<?> filter = (AbstractSelectFilter<?>) connector.getFilters().iterator().next();
// Write the search values to a temporary csv file.
final Path tmpCSv = Files.createTempFile("conquery_search", ".csv");
Files.write(tmpCSv, String.join(csvConf.getLineSeparator(), lines).getBytes(), StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.CREATE, StandardOpenOption.WRITE);
filter.setSearchType(FilterSearch.FilterSearchType.EXACT);
filter.setTemplate(new FilterTemplate(tmpCSv.toString(), Arrays.asList("HEADER"), "HEADER", "", ""));
FilterSearch.createSourceSearch(filter, csvConf);
assertThat(filter.getSourceSearch()).isNotNull();
ConceptsProcessor processor = new ConceptsProcessor(conquery.getNamespace().getNamespaces());
// from csv
{
ResolvedConceptsResult resolved = processor.resolveFilterValues(filter, List.of("a", "aaa", "unknown"));
// check the resolved values
assertThat(resolved.getResolvedFilter().getValue().stream().map(FEValue::getValue)).containsExactlyInAnyOrder("a", "aaa");
assertThat(resolved.getUnknownCodes()).containsExactlyInAnyOrder("unknown");
}
// from column values
{
ResolvedConceptsResult resolved = processor.resolveFilterValues(filter, List.of("f", "unknown"));
// check the resolved values
assertThat(resolved.getResolvedFilter().getValue().stream().map(FEValue::getValue)).containsExactlyInAnyOrder("f");
assertThat(resolved.getUnknownCodes()).containsExactlyInAnyOrder("unknown");
}
}
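Note that the `lines` field used above is declared elsewhere in the test class and is not part of this snippet. A minimal sketch of what it could contain, with the values inferred purely from the assertions in the three resolution tests on this page (they are assumptions, not the project's actual test data):
// Hypothetical contents of the shared `lines` field; inferred from the assertions only.
private final String[] lines = {
    "HEADER", // header row, referenced by the FilterTemplate column "HEADER"
    "a",      // found by EXACT, PREFIX and CONTAINS searches for "a"
    "aaa",    // found by an EXACT search for "aaa" and by PREFIX/CONTAINS searches for "a"
    "aab",    // found by PREFIX and CONTAINS searches for "a"
    "baaa"    // found only by a CONTAINS search for "a"
};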
use of com.bakdata.conquery.models.config.CSVConfig in project conquery by bakdata.
the class FilterResolutionPrefixTest method execute.
@Override
public void execute(StandaloneSupport conquery) throws Exception {
// read test specification
String testJson = In.resource("/tests/query/MULTI_SELECT_DATE_RESTRICTION_OR_CONCEPT_QUERY/MULTI_SELECT_DATE_RESTRICTION_OR_CONCEPT_QUERY.test.json").withUTF8().readAll();
DatasetId dataset = conquery.getDataset().getId();
ConqueryTestSpec test = JsonIntegrationTest.readJson(dataset, testJson);
ValidatorHelper.failOnError(log, conquery.getValidator().validate(test));
test.importRequiredData(conquery);
CSVConfig csvConf = conquery.getConfig().getCsv();
FilterSearch.updateSearch(conquery.getNamespace().getNamespaces(), Collections.singleton(conquery.getNamespace().getDataset()), conquery.getDatasetsProcessor().getJobManager(), csvConf);
conquery.waitUntilWorkDone();
Concept<?> concept = conquery.getNamespace().getStorage().getAllConcepts().iterator().next();
Connector connector = concept.getConnectors().iterator().next();
AbstractSelectFilter<?> filter = (AbstractSelectFilter<?>) connector.getFilters().iterator().next();
// Write the search values to a temporary csv file.
final Path tmpCSv = Files.createTempFile("conquery_search", ".csv");
Files.write(tmpCSv, String.join(csvConf.getLineSeparator(), lines).getBytes(), StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.CREATE, StandardOpenOption.WRITE);
filter.setSearchType(FilterSearch.FilterSearchType.PREFIX);
filter.setTemplate(new FilterTemplate(tmpCSv.toString(), Arrays.asList("HEADER"), "HEADER", "", ""));
FilterSearch.createSourceSearch(filter, csvConf);
assertThat(filter.getSourceSearch()).isNotNull();
ConceptsProcessor processor = new ConceptsProcessor(conquery.getNamespace().getNamespaces());
// from csv
{
ResolvedConceptsResult resolved = processor.resolveFilterValues(filter, List.of("a", "unknown"));
// check the resolved values
assertThat(resolved.getResolvedFilter().getValue().stream().map(FEValue::getValue)).containsExactlyInAnyOrder("a", "aaa", "aab");
assertThat(resolved.getUnknownCodes()).containsExactlyInAnyOrder("unknown");
}
// from column values
{
ResolvedConceptsResult resolved = processor.resolveFilterValues(filter, List.of("f", "unknown"));
// check the resolved values
assertThat(resolved.getResolvedFilter().getValue().stream().map(FEValue::getValue)).containsExactlyInAnyOrder("f");
assertThat(resolved.getUnknownCodes()).containsExactlyInAnyOrder("unknown");
}
}
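The prefix test differs from the exact test only in the FilterSearchType and the expected matches. As a rough illustration of how the three modes compare a source value against the query term "a" (plain String operations for clarity, not FilterSearch's actual search implementation):
// Illustration only: the matching semantics the three tests exercise, written as plain String checks.
static boolean matches(FilterSearch.FilterSearchType type, String value, String term) {
    switch (type) {
        case EXACT:
            return value.equals(term);     // "a" matches only "a"
        case PREFIX:
            return value.startsWith(term); // "a" matches "a", "aaa", "aab"
        case CONTAINS:
            return value.contains(term);   // "a" matches "a", "aaa", "aab", "baaa"
        default:
            return false;
    }
}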
use of com.bakdata.conquery.models.config.CSVConfig in project conquery by bakdata.
the class Preprocessor method preprocess.
/**
* Applies the transformations in the descriptor, then writes the result out to a CQPP file for import.
* <p>
* Reads the CSV file, extracts the primary key from each row, applies the remaining transformations to the row, and finally compresses the data with {@link ColumnStore}.
*/
public static void preprocess(PreprocessingJob preprocessingJob, ProgressBar totalProgress, ConqueryConfig config) throws IOException {
final File preprocessedFile = preprocessingJob.getPreprocessedFile();
TableImportDescriptor descriptor = preprocessingJob.getDescriptor();
// Create temp file that will be moved when finished (we ensure the same file system, to avoid unnecessary copying)
File tmp = new File(preprocessedFile.getParentFile(), preprocessedFile.getName() + ".tmp");
// Ensures deletion on failure
tmp.deleteOnExit();
if (!Files.isWritable(tmp.getParentFile().toPath())) {
throw new IllegalArgumentException("No write permission in " + LogUtil.printPath(tmp.getParentFile()));
}
if (!Files.isWritable(preprocessedFile.toPath().getParent())) {
throw new IllegalArgumentException("No write permission in " + LogUtil.printPath(preprocessedFile.toPath().getParent()));
}
// delete target file if it exists
if (preprocessedFile.exists()) {
FileUtils.forceDelete(preprocessedFile);
}
log.info("PREPROCESSING START in {}", preprocessingJob);
int errors = 0;
final Preprocessed result = new Preprocessed(config, preprocessingJob);
long lineId = 0;
// Gather exception classes to get better overview of what kind of errors are happening.
Object2IntMap<Class<? extends Throwable>> exceptions = new Object2IntArrayMap<>();
exceptions.defaultReturnValue(0);
for (int inputSource = 0; inputSource < descriptor.getInputs().length; inputSource++) {
final TableInputDescriptor input = descriptor.getInputs()[inputSource];
final File sourceFile = resolveSourceFile(input.getSourceFile(), preprocessingJob.getCsvDirectory(), preprocessingJob.getTag());
final String name = String.format("%s:%s[%d/%s]", descriptor.toString(), descriptor.getTable(), inputSource, sourceFile.getName());
ConqueryMDC.setLocation(name);
if (!(sourceFile.exists() && sourceFile.canRead())) {
throw new FileNotFoundException(sourceFile.getAbsolutePath());
}
CsvParser parser = null;
try (CountingInputStream countingIn = new CountingInputStream(new FileInputStream(sourceFile))) {
long progress = 0;
CSVConfig csvSettings = config.getCsv();
// Create CSV parser according to config, but overriding some behaviour.
parser = csvSettings.withParseHeaders(true).withSkipHeader(false).createParser();
parser.beginParsing(FileUtil.isGZipped(sourceFile) ? new GZIPInputStream(countingIn) : countingIn, csvSettings.getEncoding());
final String[] headers = parser.getContext().parsedHeaders();
final Object2IntArrayMap<String> headerMap = TableInputDescriptor.buildHeaderMap(headers);
// Compile filter.
final GroovyPredicate filter = input.createFilter(headers);
DateReader dateReader = config.getLocale().getDateReader();
final OutputDescription.Output primaryOut = input.getPrimary().createForHeaders(headerMap, dateReader);
final List<OutputDescription.Output> outputs = new ArrayList<>();
final PPColumn[] columns = result.getColumns();
// Instantiate Outputs based on descriptors (apply header positions)
for (OutputDescription op : input.getOutput()) {
outputs.add(op.createForHeaders(headerMap, dateReader));
}
String[] row;
// Read all CSV lines, apply the Output transformations and add them to the Preprocessed result.
while ((row = parser.parseNext()) != null) {
// This is explicitly NOT in a try-catch block: filter scripts must not fail, and we should not recover from faulty scripts.
if (filter != null && !filter.filterRow(row)) {
continue;
}
try {
int primaryId = (int) Objects.requireNonNull(primaryOut.createOutput(row, result.getPrimaryColumn(), lineId), "primaryId may not be null");
final int primary = result.addPrimary(primaryId);
final Object[] outRow = applyOutputs(outputs, columns, row, lineId);
result.addRow(primary, columns, outRow);
} catch (OutputDescription.OutputException e) {
exceptions.put(e.getCause().getClass(), exceptions.getInt(e.getCause().getClass()) + 1);
errors++;
if (log.isTraceEnabled() || errors < config.getPreprocessor().getMaximumPrintedErrors()) {
log.warn("Failed to parse `{}` from line: {} content: {}", e.getSource(), lineId, row, e.getCause());
} else if (errors == config.getPreprocessor().getMaximumPrintedErrors()) {
log.warn("More erroneous lines occurred. Only the first " + config.getPreprocessor().getMaximumPrintedErrors() + " were printed.");
}
} catch (Exception e) {
exceptions.put(e.getClass(), exceptions.getInt(e.getClass()) + 1);
errors++;
if (log.isTraceEnabled() || errors < config.getPreprocessor().getMaximumPrintedErrors()) {
log.warn("Failed to parse line: {} content: {}", lineId, row, e);
} else if (errors == config.getPreprocessor().getMaximumPrintedErrors()) {
log.warn("More erroneous lines occurred. Only the first " + config.getPreprocessor().getMaximumPrintedErrors() + " were printed.");
}
} finally {
// report progress
totalProgress.addCurrentValue(countingIn.getCount() - progress);
progress = countingIn.getCount();
lineId++;
}
}
} finally {
if (parser != null) {
parser.stopParsing();
}
}
}
if (errors > 0) {
log.warn("File `{}` contained {} faulty lines of ~{} total.", preprocessingJob, errors, lineId);
}
if (log.isWarnEnabled()) {
exceptions.forEach((clazz, count) -> log.warn("Got {} `{}`", count, clazz.getSimpleName()));
}
result.write(tmp);
if (errors > 0) {
log.warn("Had {}% faulty lines ({} of ~{} lines)", String.format("%.2f", 100d * (double) errors / (double) lineId), errors, lineId);
}
if ((double) errors / (double) lineId > config.getPreprocessor().getFaultyLineThreshold()) {
throw new RuntimeException("Too many faulty lines.");
}
// if successful move the tmp file to the target location
FileUtils.moveFile(tmp, preprocessedFile);
log.info("PREPROCESSING DONE in {}", preprocessingJob);
}
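Condensed, the CSV-reading core of the method above is: build a parser from the CSVConfig (headers parsed, not skipped), begin parsing the possibly gzipped stream, then pull rows one at a time. A minimal sketch using only the calls already shown; filtering, output transformations and error accounting are omitted, and imports are assumed to match the method above:
// Condensed sketch of the parsing loop in preprocess(); not a drop-in replacement.
CSVConfig csvSettings = config.getCsv();
CsvParser parser = csvSettings.withParseHeaders(true).withSkipHeader(false).createParser();
try (CountingInputStream countingIn = new CountingInputStream(new FileInputStream(sourceFile))) {
    parser.beginParsing(FileUtil.isGZipped(sourceFile) ? new GZIPInputStream(countingIn) : countingIn, csvSettings.getEncoding());
    final String[] headers = parser.getContext().parsedHeaders(); // column names, used to build the header map
    String[] row;
    while ((row = parser.parseNext()) != null) {
        // each row would be filtered, then passed through the primary output and the other outputs
    }
} finally {
    parser.stopParsing();
}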
use of com.bakdata.conquery.models.config.CSVConfig in project conquery by bakdata.
the class FilterResolutionContainsTest method execute.
@Override
public void execute(StandaloneSupport conquery) throws Exception {
// read test specification
String testJson = In.resource("/tests/query/MULTI_SELECT_DATE_RESTRICTION_OR_CONCEPT_QUERY/MULTI_SELECT_DATE_RESTRICTION_OR_CONCEPT_QUERY.test.json").withUTF8().readAll();
DatasetId dataset = conquery.getDataset().getId();
ConqueryTestSpec test = JsonIntegrationTest.readJson(dataset, testJson);
ValidatorHelper.failOnError(log, conquery.getValidator().validate(test));
test.importRequiredData(conquery);
CSVConfig csvConf = conquery.getConfig().getCsv();
FilterSearch.updateSearch(conquery.getNamespace().getNamespaces(), Collections.singleton(conquery.getNamespace().getDataset()), conquery.getDatasetsProcessor().getJobManager(), csvConf);
conquery.waitUntilWorkDone();
Concept<?> concept = conquery.getNamespace().getStorage().getAllConcepts().iterator().next();
Connector connector = concept.getConnectors().iterator().next();
AbstractSelectFilter<?> filter = (AbstractSelectFilter<?>) connector.getFilters().iterator().next();
// Write the search values to a temporary csv file.
final Path tmpCSv = Files.createTempFile("conquery_search", ".csv");
Files.write(tmpCSv, String.join(csvConf.getLineSeparator(), lines).getBytes(), StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.CREATE, StandardOpenOption.WRITE);
filter.setSearchType(FilterSearch.FilterSearchType.CONTAINS);
filter.setTemplate(new FilterTemplate(tmpCSv.toString(), Arrays.asList("HEADER"), "HEADER", "", ""));
FilterSearch.createSourceSearch(filter, csvConf);
assertThat(filter.getSourceSearch()).isNotNull();
ConceptsProcessor processor = new ConceptsProcessor(conquery.getNamespace().getNamespaces());
// from csv
{
ResolvedConceptsResult resolved = processor.resolveFilterValues(filter, List.of("a", "unknown"));
// check the resolved values
assertThat(resolved.getResolvedFilter().getValue().stream().map(FEValue::getValue)).containsExactlyInAnyOrder("a", "aaa", "aab", "baaa");
assertThat(resolved.getUnknownCodes()).containsExactlyInAnyOrder("unknown");
}
// from column values
{
ResolvedConceptsResult resolved = processor.resolveFilterValues(filter, List.of("f", "unknown"));
// check the resolved values
assertThat(resolved.getResolvedFilter().getValue().stream().map(FEValue::getValue)).containsExactlyInAnyOrder("f");
assertThat(resolved.getUnknownCodes()).containsExactlyInAnyOrder("unknown");
}
}
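All three tests consume the result the same way. A small usage sketch restricted to the accessors exercised by the assertions; the query values ("aaa", "does-not-exist") and the use of Collectors are illustrative additions:
// Usage sketch: resolve a mix of known and unknown values and read back both result sets.
ResolvedConceptsResult resolved = processor.resolveFilterValues(filter, List.of("aaa", "does-not-exist"));
List<String> resolvedValues = resolved.getResolvedFilter().getValue().stream()
    .map(FEValue::getValue)
    .collect(Collectors.toList());
// resolvedValues now holds the values the source search could match,
// while resolved.getUnknownCodes() holds everything that could not be resolved.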