Use of org.apache.druid.data.input.InputRowListPlusRawValues in project druid by druid-io.
The class JsonReaderTest, method testSamplInvalidJSONText:
@Test
public void testSamplInvalidJSONText() throws IOException
{
  final JsonInputFormat format = new JsonInputFormat(
      new JSONPathSpec(
          true,
          ImmutableList.of(
              new JSONPathFieldSpec(JSONPathFieldType.ROOT, "root_baz", "baz"),
              new JSONPathFieldSpec(JSONPathFieldType.ROOT, "root_baz2", "baz2"),
              new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg", "$.o.mg"),
              new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg2", "$.o.mg2"),
              new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg", ".o.mg"),
              new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2")
          )
      ),
      null,
      null,
      false // make sure JsonReader is used
  );

  // the 2nd row is ill-formed: the value of baz is invalid
  final ByteEntity source = new ByteEntity(StringUtils.toUtf8(
      "{\"timestamp\":\"2019-01-01\",\"bar\":null,\"foo\":\"x\",\"baz\":4,\"o\":{\"mg\":1}}"
      + "{\"timestamp\":\"2019-01-01\",\"bar\":null,\"foo\":\"x\",\"baz\":4xxx,\"o\":{\"mg\":2}}\n"
      + "{\"timestamp\":\"2019-01-01\",\"bar\":null,\"foo\":\"x\",\"baz\":4,\"o\":{\"mg\":3}}\n"
  ));

  final InputEntityReader reader = format.createReader(
      new InputRowSchema(
          new TimestampSpec("timestamp", "iso", null),
          new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("bar", "foo"))),
          ColumnsFilter.all()
      ),
      source,
      null
  );

  // the invalid character in row 2 stops parsing of the three-row text as a whole,
  // so the total number of iterations is 1
  final int numExpectedIterations = 1;
  try (CloseableIterator<InputRowListPlusRawValues> iterator = reader.sample()) {
    int numActualIterations = 0;
    while (iterator.hasNext()) {
      numActualIterations++;
      final InputRowListPlusRawValues rawValues = iterator.next();
      Assert.assertNotNull(rawValues.getParseException());
    }
    Assert.assertEquals(numExpectedIterations, numActualIterations);
  }
}
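For context, a minimal sketch of how a caller typically consumes sample() output, branching on getParseException() before touching the rows. It reuses the reader from the test above; the printing is purely for illustration:

// Minimal sketch of consuming reader.sample(); `reader` is assumed to be the
// InputEntityReader created in the test above.
try (CloseableIterator<InputRowListPlusRawValues> it = reader.sample()) {
  while (it.hasNext()) {
    final InputRowListPlusRawValues chunk = it.next();
    if (chunk.getParseException() != null) {
      // raw values may still be available even when parsing failed
      System.err.println("unparseable: " + chunk.getParseException().getMessage());
    } else if (chunk.getInputRows() != null) {
      chunk.getInputRows().forEach(r -> System.out.println(r.getTimestamp()));
    }
  }
}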
Use of org.apache.druid.data.input.InputRowListPlusRawValues in project druid by druid-io.
The class AvroOCFReaderTest, method testSample:
@Test
public void testSample() throws Exception
{
  final ObjectMapper mapper = new DefaultObjectMapper();
  mapper.setInjectableValues(new InjectableValues.Std().addValue(ObjectMapper.class, mapper));
  final InputEntityReader reader = createReader(mapper, null);

  try (CloseableIterator<InputRowListPlusRawValues> iterator = reader.sample()) {
    Assert.assertTrue(iterator.hasNext());
    final InputRowListPlusRawValues row = iterator.next();
    Assert.assertFalse(iterator.hasNext());

    final Map<String, Object> rawColumns = row.getRawValues();
    Assert.assertNotNull(rawColumns);
    Assert.assertEquals(20, rawColumns.size());

    final List<InputRow> inputRows = row.getInputRows();
    Assert.assertNotNull(inputRows);
    final InputRow inputRow = Iterables.getOnlyElement(inputRows);
    assertInputRow(inputRow);
  }
}
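A note on the two raw-value accessors seen in these examples: this test reads a single map via getRawValues(), while InputSourceSampler below iterates getRawValuesList(). A minimal sketch of the list-valued form, reusing the `row` variable from the test above and assuming the index alignment that the comment in InputSourceSampler asserts:

// Sketch: iterate raw-values maps alongside their parsed InputRows.
// Per InputRowListPlusRawValues, the two lists are the same size.
final List<Map<String, Object>> rawList = row.getRawValuesList();
final List<InputRow> rows = row.getInputRows();
if (rawList != null && rows != null) {
  for (int i = 0; i < rows.size(); i++) {
    System.out.println(rawList.get(i).size() + " raw columns for row " + rows.get(i).getTimestamp());
  }
}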
Use of org.apache.druid.data.input.InputRowListPlusRawValues in project druid by druid-io.
The class InputSourceSampler, method sample:
public SamplerResponse sample(
    final InputSource inputSource,
    // inputFormat can be null only if inputSource.needsFormat() = false or parser is specified
    @Nullable final InputFormat inputFormat,
    @Nullable final DataSchema dataSchema,
    @Nullable final SamplerConfig samplerConfig
)
{
  Preconditions.checkNotNull(inputSource, "inputSource required");
  if (inputSource.needsFormat()) {
    Preconditions.checkNotNull(inputFormat, "inputFormat required");
  }
  final DataSchema nonNullDataSchema = dataSchema == null ? DEFAULT_DATA_SCHEMA : dataSchema;
  final SamplerConfig nonNullSamplerConfig = samplerConfig == null ? SamplerConfig.empty() : samplerConfig;

  final Closer closer = Closer.create();
  final File tempDir = FileUtils.createTempDir();
  closer.register(() -> FileUtils.deleteDirectory(tempDir));

  try {
    final InputSourceReader reader = buildReader(
        nonNullSamplerConfig,
        nonNullDataSchema,
        inputSource,
        inputFormat,
        tempDir
    );
    try (final CloseableIterator<InputRowListPlusRawValues> iterator = reader.sample();
         final IncrementalIndex index = buildIncrementalIndex(nonNullSamplerConfig, nonNullDataSchema);
         final Closer closer1 = closer) {
      List<SamplerResponseRow> responseRows = new ArrayList<>(nonNullSamplerConfig.getNumRows());
      int numRowsIndexed = 0;

      while (responseRows.size() < nonNullSamplerConfig.getNumRows() && iterator.hasNext()) {
        final InputRowListPlusRawValues inputRowListPlusRawValues = iterator.next();
        final List<Map<String, Object>> rawColumnsList = inputRowListPlusRawValues.getRawValuesList();

        final ParseException parseException = inputRowListPlusRawValues.getParseException();
        if (parseException != null) {
          if (rawColumnsList != null) {
            // add all unparseable rows to the response
            responseRows.addAll(
                rawColumnsList.stream()
                              .map(rawColumns -> new SamplerResponseRow(rawColumns, null, true, parseException.getMessage()))
                              .collect(Collectors.toList())
            );
          } else {
            // no data was parsed; add a single response row
            responseRows.add(new SamplerResponseRow(null, null, true, parseException.getMessage()));
          }
          continue;
        }

        List<InputRow> inputRows = inputRowListPlusRawValues.getInputRows();
        if (inputRows == null) {
          continue;
        }

        for (int i = 0; i < inputRows.size(); i++) {
          // InputRowListPlusRawValues guarantees that rawColumnsList and inputRows have the same size
          Map<String, Object> rawColumns = rawColumnsList == null ? null : rawColumnsList.get(i);
          InputRow row = inputRows.get(i);

          // keep the index of the row to be added to responseRows for later use
          final int rowIndex = responseRows.size();
          IncrementalIndexAddResult addResult = index.add(new SamplerInputRow(row, rowIndex), true);
          if (addResult.hasParseException()) {
            responseRows.add(new SamplerResponseRow(rawColumns, null, true, addResult.getParseException().getMessage()));
          } else {
            // store the raw value; it will be merged with the data from the IncrementalIndex later
            responseRows.add(new SamplerResponseRow(rawColumns, null, null, null));
            numRowsIndexed++;
          }
        }
      }

      final List<String> columnNames = index.getColumnNames();
      columnNames.remove(SamplerInputRow.SAMPLER_ORDERING_COLUMN);

      for (Row row : index) {
        Map<String, Object> parsed = new LinkedHashMap<>();
        parsed.put(ColumnHolder.TIME_COLUMN_NAME, row.getTimestampFromEpoch());
        columnNames.forEach(k -> parsed.put(k, row.getRaw(k)));

        Number sortKey = row.getMetric(SamplerInputRow.SAMPLER_ORDERING_COLUMN);
        if (sortKey != null) {
          responseRows.set(sortKey.intValue(), responseRows.get(sortKey.intValue()).withParsed(parsed));
        }
      }

      // make sure the size of responseRows does not exceed the configured row limit
      if (responseRows.size() > nonNullSamplerConfig.getNumRows()) {
        responseRows = responseRows.subList(0, nonNullSamplerConfig.getNumRows());
      }

      int numRowsRead = responseRows.size();
      return new SamplerResponse(
          numRowsRead,
          numRowsIndexed,
          responseRows.stream()
                      .filter(Objects::nonNull)
                      .filter(x -> x.getParsed() != null || x.isUnparseable() != null)
                      .collect(Collectors.toList())
      );
    }
  }
  catch (Exception e) {
    throw new SamplerException(e, "Failed to sample data: %s", e.getMessage());
  }
}
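For orientation, a call into this method might look as follows. The InlineInputSource data, the three-argument JsonInputFormat constructor, the no-argument InputSourceSampler constructor, and the response getter names are assumptions for illustration, inferred from the surrounding code rather than confirmed by it:

// Hypothetical invocation of InputSourceSampler.sample(); constructors and
// getter names are assumptions based on the code above.
InputSourceSampler sampler = new InputSourceSampler();
SamplerResponse response = sampler.sample(
    new InlineInputSource("{\"timestamp\":\"2019-01-01\",\"foo\":\"x\"}\n"),
    new JsonInputFormat(null, null, null),
    null, // null dataSchema falls back to DEFAULT_DATA_SCHEMA
    null  // null samplerConfig falls back to SamplerConfig.empty()
);
System.out.println(response.getNumRowsRead() + " read, " + response.getNumRowsIndexed() + " indexed");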
Use of org.apache.druid.data.input.InputRowListPlusRawValues in project druid by druid-io.
The class Transformer, method transform:
@Nullable
public InputRowListPlusRawValues transform(@Nullable final InputRowListPlusRawValues row)
{
  if (row == null) {
    return null;
  }

  final InputRowListPlusRawValues inputRowListPlusRawValues;
  if (transforms.isEmpty() || row.getInputRows() == null) {
    inputRowListPlusRawValues = row;
  } else {
    final List<InputRow> originalRows = row.getInputRows();
    final List<InputRow> transformedRows = new ArrayList<>(originalRows.size());
    for (InputRow originalRow : originalRows) {
      transformedRows.add(new TransformedInputRow(originalRow, transforms));
    }
    inputRowListPlusRawValues = InputRowListPlusRawValues.ofList(row.getRawValuesList(), transformedRows);
  }

  if (valueMatcher != null) {
    if (inputRowListPlusRawValues.getInputRows() != null) {
      // the sizes of inputRows and rawValues are the same
      final int size = inputRowListPlusRawValues.getInputRows().size();
      final List<InputRow> matchedRows = new ArrayList<>(size);
      final List<Map<String, Object>> matchedVals = new ArrayList<>(size);

      final List<InputRow> inputRows = inputRowListPlusRawValues.getInputRows();
      final List<Map<String, Object>> inputVals = inputRowListPlusRawValues.getRawValuesList();
      for (int i = 0; i < size; i++) {
        rowSupplierForValueMatcher.set(inputRows.get(i));
        if (valueMatcher.matches()) {
          matchedRows.add(inputRows.get(i));
          matchedVals.add(inputVals.get(i));
        }
      }
      return InputRowListPlusRawValues.ofList(matchedVals, matchedRows);
    }
  }

  return inputRowListPlusRawValues;
}
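As a usage sketch: a Transformer is typically obtained from a TransformSpec, which supplies both the transforms applied above and the filter behind valueMatcher. The specific filter and expression below are illustrative assumptions, not taken from the code above:

// Hypothetical TransformSpec: keep only rows where foo == "x" and add a
// derived column fooUpper; transform() then wraps and filters the rows.
TransformSpec spec = new TransformSpec(
    new SelectorDimFilter("foo", "x", null),
    ImmutableList.of(new ExpressionTransform("fooUpper", "upper(foo)", ExprMacroTable.nil()))
);
Transformer transformer = spec.toTransformer();
InputRowListPlusRawValues transformed = transformer.transform(inputRowListPlusRawValues);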