Search in sources:

Example 1 with HadoopDruidIndexerConfig

use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.

the class CompatParquetInputTest method testReadNestedArrayStruct.

@Test
public void testReadNestedArrayStruct() throws IOException, InterruptedException {
    // parquet-avro cannot convert this nested array-of-structs input, so skip that parser
    if (parserType.equals(ParquetExtensionsModule.PARQUET_AVRO_INPUT_PARSER_TYPE)) {
        return;
    }
    HadoopDruidIndexerConfig config = transformHadoopDruidIndexerConfig("example/compat/nested_array_struct.json", parserType, true);
    config.intoConfiguration(job);
    List<InputRow> rows = getAllRows(parserType, config);
    Assert.assertEquals("2018-09-01T00:00:00.000Z", rows.get(0).getTimestamp().toString());
    Assert.assertEquals("5", rows.get(0).getDimension("primitive").get(0));
    Assert.assertEquals("4", rows.get(0).getDimension("extracted1").get(0));
    Assert.assertEquals("6", rows.get(0).getDimension("extracted2").get(0));
}
Also used : InputRow(org.apache.druid.data.input.InputRow) HadoopDruidIndexerConfig(org.apache.druid.indexer.HadoopDruidIndexerConfig) Test(org.junit.Test)
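
transformHadoopDruidIndexerConfig and getAllRows are helpers from the test's base class and are not shown on this page. Outside the test harness, a config can be built directly from a spec file with the factory methods on HadoopDruidIndexerConfig. The sketch below is a minimal illustration, assuming a complete HadoopIngestionSpec JSON at a hypothetical path (the example spec files referenced above contain placeholders that the test helper fills in first); only intoConfiguration, getPathSpec and getParser appear in the examples on this page, while fromFile, fromConfiguration and getDataSource are recalled from Druid's indexer API.

import java.io.File;
import org.apache.druid.indexer.HadoopDruidIndexerConfig;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class IndexerConfigRoundTrip {
    public static void main(String[] args) throws Exception {
        // Build the config from a full HadoopIngestionSpec JSON file (hypothetical path).
        HadoopDruidIndexerConfig config =
            HadoopDruidIndexerConfig.fromFile(new File("path/to/hadoop_ingestion_spec.json"));
        // Serialize the config into the Hadoop job, as the tests above do with intoConfiguration().
        Job job = Job.getInstance(new Configuration(), "druid-indexer-config-sketch");
        config.intoConfiguration(job);
        // A map/reduce task can later rebuild the same config from the job's Configuration.
        HadoopDruidIndexerConfig rebuilt =
            HadoopDruidIndexerConfig.fromConfiguration(job.getConfiguration());
        System.out.println(rebuilt.getDataSource());
    }
}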

Example 2 with HadoopDruidIndexerConfig

use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.

the class CompatParquetInputTest method testParquetThriftCompat.

@Test
public void testParquetThriftCompat() throws IOException, InterruptedException {
    // Map key type must be binary (UTF8): required int32 key
    if (parserType.equals(ParquetExtensionsModule.PARQUET_AVRO_INPUT_PARSER_TYPE)) {
        return;
    }
    /*
      message ParquetSchema {
        required boolean boolColumn;
        required int32 byteColumn;
        required int32 shortColumn;
        required int32 intColumn;
        required int64 longColumn;
        required double doubleColumn;
        required binary binaryColumn (UTF8);
        required binary stringColumn (UTF8);
        required binary enumColumn (ENUM);
        optional boolean maybeBoolColumn;
        optional int32 maybeByteColumn;
        optional int32 maybeShortColumn;
        optional int32 maybeIntColumn;
        optional int64 maybeLongColumn;
        optional double maybeDoubleColumn;
        optional binary maybeBinaryColumn (UTF8);
        optional binary maybeStringColumn (UTF8);
        optional binary maybeEnumColumn (ENUM);
        required group stringsColumn (LIST) {
          repeated binary stringsColumn_tuple (UTF8);
        }
        required group intSetColumn (LIST) {
          repeated int32 intSetColumn_tuple;
        }
        required group intToStringColumn (MAP) {
          repeated group map (MAP_KEY_VALUE) {
            required int32 key;
            optional binary value (UTF8);
          }
        }
        required group complexColumn (MAP) {
          repeated group map (MAP_KEY_VALUE) {
            required int32 key;
            optional group value (LIST) {
              repeated group value_tuple {
                required group nestedIntsColumn (LIST) {
                  repeated int32 nestedIntsColumn_tuple;
                }
                required binary nestedStringColumn (UTF8);
              }
            }
          }
        }
      }
     */
    HadoopDruidIndexerConfig config = transformHadoopDruidIndexerConfig("example/compat/parquet_thrift_compat.json", parserType, true);
    config.intoConfiguration(job);
    Object data = getFirstRow(job, parserType, ((StaticPathSpec) config.getPathSpec()).getPaths());
    List<InputRow> rows = (List<InputRow>) config.getParser().parseBatch(data);
    Assert.assertEquals("2018-09-01T00:00:00.000Z", rows.get(0).getTimestamp().toString());
    Assert.assertEquals("true", rows.get(0).getDimension("boolColumn").get(0));
    Assert.assertEquals("0", rows.get(0).getDimension("byteColumn").get(0));
    Assert.assertEquals("1", rows.get(0).getDimension("shortColumn").get(0));
    Assert.assertEquals("2", rows.get(0).getDimension("intColumn").get(0));
    Assert.assertEquals("0", rows.get(0).getDimension("longColumn").get(0));
    Assert.assertEquals("0.2", rows.get(0).getDimension("doubleColumn").get(0));
    Assert.assertEquals("val_0", rows.get(0).getDimension("binaryColumn").get(0));
    Assert.assertEquals("val_0", rows.get(0).getDimension("stringColumn").get(0));
    Assert.assertEquals("SPADES", rows.get(0).getDimension("enumColumn").get(0));
    Assert.assertTrue(rows.get(0).getDimension("maybeBoolColumn").isEmpty());
    Assert.assertTrue(rows.get(0).getDimension("maybeByteColumn").isEmpty());
    Assert.assertTrue(rows.get(0).getDimension("maybeShortColumn").isEmpty());
    Assert.assertTrue(rows.get(0).getDimension("maybeIntColumn").isEmpty());
    Assert.assertTrue(rows.get(0).getDimension("maybeLongColumn").isEmpty());
    Assert.assertTrue(rows.get(0).getDimension("maybeDoubleColumn").isEmpty());
    Assert.assertTrue(rows.get(0).getDimension("maybeBinaryColumn").isEmpty());
    Assert.assertTrue(rows.get(0).getDimension("maybeStringColumn").isEmpty());
    Assert.assertTrue(rows.get(0).getDimension("maybeEnumColumn").isEmpty());
    Assert.assertEquals("arr_0", rows.get(0).getDimension("stringsColumn").get(0));
    Assert.assertEquals("arr_1", rows.get(0).getDimension("stringsColumn").get(1));
    Assert.assertEquals("0", rows.get(0).getDimension("intSetColumn").get(0));
    Assert.assertEquals("val_1", rows.get(0).getDimension("extractByLogicalMap").get(0));
    Assert.assertEquals("1", rows.get(0).getDimension("extractByComplexLogicalMap").get(0));
}
Also used : InputRow(org.apache.druid.data.input.InputRow) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) HadoopDruidIndexerConfig(org.apache.druid.indexer.HadoopDruidIndexerConfig) Test(org.junit.Test)
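
The Thrift-compat schema above is deeply nested, and the test inspects individual dimensions one assertion at a time. When debugging a flattenSpec against such a file it can help to dump everything a parsed row contains. A small sketch using only InputRow accessors: getTimestamp and getDimension appear in the examples above, while getDimensions is assumed from the InputRow interface.

import java.util.List;
import org.apache.druid.data.input.InputRow;

public class RowDump {
    // Print the timestamp and every dimension of a parsed row.
    public static void dump(InputRow row) {
        System.out.println("timestamp: " + row.getTimestamp());
        for (String dim : row.getDimensions()) {
            List<String> values = row.getDimension(dim);
            System.out.println(dim + " -> " + values);
        }
    }
}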

Example 3 with HadoopDruidIndexerConfig

use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.

the class CompatParquetInputTest method testProtoStructWithArray.

@Test
public void testProtoStructWithArray() throws IOException, InterruptedException {
    // "REPEATED not supported outside LIST or MAP. Type: repeated int32 repeatedPrimitive"
    if (parserType.equals(ParquetExtensionsModule.PARQUET_AVRO_INPUT_PARSER_TYPE)) {
        return;
    }
    HadoopDruidIndexerConfig config = transformHadoopDruidIndexerConfig("example/compat/proto_struct_with_array.json", parserType, true);
    config.intoConfiguration(job);
    List<InputRow> rows = getAllRows(parserType, config);
    Assert.assertEquals("2018-09-01T00:00:00.000Z", rows.get(0).getTimestamp().toString());
    Assert.assertEquals("10", rows.get(0).getDimension("optionalPrimitive").get(0));
    Assert.assertEquals("9", rows.get(0).getDimension("requiredPrimitive").get(0));
    Assert.assertTrue(rows.get(0).getDimension("repeatedPrimitive").isEmpty());
    Assert.assertTrue(rows.get(0).getDimension("extractedOptional").isEmpty());
    Assert.assertEquals("9", rows.get(0).getDimension("extractedRequired").get(0));
    Assert.assertEquals("9", rows.get(0).getDimension("extractedRepeated").get(0));
    Assert.assertEquals("10", rows.get(0).getDimension("extractedRepeated").get(1));
}
Also used : InputRow(org.apache.druid.data.input.InputRow) HadoopDruidIndexerConfig(org.apache.druid.indexer.HadoopDruidIndexerConfig) Test(org.junit.Test)

Example 4 with HadoopDruidIndexerConfig

use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.

the class CompatParquetInputTest method testParquet1217.

@Test
public void testParquet1217() throws IOException, InterruptedException {
    HadoopDruidIndexerConfig config = transformHadoopDruidIndexerConfig("example/compat/parquet_1217.json", parserType, true);
    config.intoConfiguration(job);
    Object data = getFirstRow(job, parserType, ((StaticPathSpec) config.getPathSpec()).getPaths());
    List<InputRow> rows = (List<InputRow>) config.getParser().parseBatch(data);
    List<InputRow> rows2 = getAllRows(parserType, config);
    Assert.assertEquals("2018-09-01T00:00:00.000Z", rows.get(0).getTimestamp().toString());
    Assert.assertEquals("-1", rows.get(0).getDimension("col").get(0));
    Assert.assertEquals(-1, rows.get(0).getMetric("metric1"));
    Assert.assertTrue(rows2.get(2).getDimension("col").isEmpty());
}
Also used : InputRow(org.apache.druid.data.input.InputRow) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) HadoopDruidIndexerConfig(org.apache.druid.indexer.HadoopDruidIndexerConfig) Test(org.junit.Test)

Example 5 with HadoopDruidIndexerConfig

use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.

the class DecimalParquetInputTest method testReadParquetDecimali64.

@Test
public void testReadParquetDecimali64() throws IOException, InterruptedException {
    // parquet-avro does not correctly convert decimal types
    if (parserType.equals(ParquetExtensionsModule.PARQUET_AVRO_INPUT_PARSER_TYPE)) {
        return;
    }
    HadoopDruidIndexerConfig config = transformHadoopDruidIndexerConfig("example/decimals/dec_in_i64.json", parserType, true);
    List<InputRow> rows = getAllRows(parserType, config);
    Assert.assertEquals("2018-09-01T00:00:00.000Z", rows.get(0).getTimestamp().toString());
    Assert.assertEquals("100", rows.get(0).getDimension("i64_dec").get(0));
    Assert.assertEquals(new BigDecimal(100), rows.get(0).getMetric("metric1"));
}
Also used : InputRow(org.apache.druid.data.input.InputRow) HadoopDruidIndexerConfig(org.apache.druid.indexer.HadoopDruidIndexerConfig) BigDecimal(java.math.BigDecimal) Test(org.junit.Test)
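
One detail worth noting about the decimal assertion: BigDecimal.equals is scale-sensitive, so Assert.assertEquals(new BigDecimal(100), ...) only passes when the parsed metric also carries scale 0; a value read back as 100.00 would fail even though it is numerically equal. A standalone sketch of the difference:

import java.math.BigDecimal;

public class DecimalScaleCheck {
    public static void main(String[] args) {
        BigDecimal unscaled = new BigDecimal(100);     // scale 0
        BigDecimal scaled = new BigDecimal("100.00");  // scale 2
        System.out.println(unscaled.equals(scaled));          // false: equals compares scale too
        System.out.println(unscaled.compareTo(scaled) == 0);  // true: numerically equal
    }
}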

Aggregations

HadoopDruidIndexerConfig (org.apache.druid.indexer.HadoopDruidIndexerConfig): 34
Test (org.junit.Test): 32
InputRow (org.apache.druid.data.input.InputRow): 27
ImmutableList (com.google.common.collect.ImmutableList): 19
List (java.util.List): 19
Job (org.apache.hadoop.mapreduce.Job): 9
ArrayList (java.util.ArrayList): 8
Configuration (org.apache.hadoop.conf.Configuration): 8
OrcStruct (org.apache.orc.mapred.OrcStruct): 6
BigDecimal (java.math.BigDecimal): 3
HadoopIngestionSpec (org.apache.druid.indexer.HadoopIngestionSpec): 3
IOException (java.io.IOException): 2
HashSet (java.util.HashSet): 2
DimensionSchema (org.apache.druid.data.input.impl.DimensionSchema): 2
ParseSpec (org.apache.druid.data.input.impl.ParseSpec): 2
Bucket (org.apache.druid.indexer.Bucket): 2
AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory): 2
DataSegment (org.apache.druid.timeline.DataSegment): 2
NumberedShardSpec (org.apache.druid.timeline.partition.NumberedShardSpec): 2
Path (org.apache.hadoop.fs.Path): 2