use of io.druid.data.input.impl.InputRowParser in project hive by apache.
the class TestDruidRecordWriter method testWrite.
// This test needs this patch: https://github.com/druid-io/druid/pull/3483
@Ignore
@Test
public void testWrite() throws IOException, SegmentLoadingException {
final String dataSourceName = "testDataSource";
final File segmentOutputDir = temporaryFolder.newFolder();
final File workingDir = temporaryFolder.newFolder();
Configuration config = new Configuration();
final InputRowParser inputRowParser = new MapInputRowParser(new TimeAndDimsParseSpec(new TimestampSpec(DruidTable.DEFAULT_TIMESTAMP_COLUMN, "auto", null), new DimensionsSpec(ImmutableList.<DimensionSchema>of(new StringDimensionSchema("host")), null, null)));
final Map<String, Object> parserMap = objectMapper.convertValue(inputRowParser, Map.class);
DataSchema dataSchema = new DataSchema(dataSourceName, parserMap, new AggregatorFactory[] { new LongSumAggregatorFactory("visited_sum", "visited_sum"), new HyperUniquesAggregatorFactory("unique_hosts", "unique_hosts") }, new UniformGranularitySpec(Granularity.DAY, QueryGranularities.NONE, ImmutableList.of(INTERVAL_FULL)), objectMapper);
RealtimeTuningConfig tuningConfig = RealtimeTuningConfig.makeDefaultTuningConfig(temporaryFolder.newFolder());
LocalFileSystem localFileSystem = FileSystem.getLocal(config);
DataSegmentPusher dataSegmentPusher = new LocalDataSegmentPusher(new LocalDataSegmentPusherConfig() {
@Override
public File getStorageDirectory() {
return segmentOutputDir;
}
}, objectMapper);
Path segmentDescriptorPath = new Path(workingDir.getAbsolutePath(), DruidStorageHandler.SEGMENTS_DESCRIPTOR_DIR_NAME);
druidRecordWriter = new DruidRecordWriter(dataSchema, tuningConfig, dataSegmentPusher, 20, segmentDescriptorPath, localFileSystem);
List<DruidWritable> druidWritables = Lists.transform(expectedRows, new Function<ImmutableMap<String, Object>, DruidWritable>() {
@Nullable
@Override
public DruidWritable apply(@Nullable ImmutableMap<String, Object> input) {
return new DruidWritable(ImmutableMap.<String, Object>builder().putAll(input).put(Constants.DRUID_TIMESTAMP_GRANULARITY_COL_NAME, Granularity.DAY.truncate(new DateTime((long) input.get(DruidTable.DEFAULT_TIMESTAMP_COLUMN))).getMillis()).build());
}
});
for (DruidWritable druidWritable : druidWritables) {
druidRecordWriter.write(druidWritable);
}
druidRecordWriter.close(false);
List<DataSegment> dataSegmentList = DruidStorageHandlerUtils.getPublishedSegments(segmentDescriptorPath, config);
Assert.assertEquals(1, dataSegmentList.size());
File tmpUnzippedSegmentDir = temporaryFolder.newFolder();
new LocalDataSegmentPuller().getSegmentFiles(dataSegmentList.get(0), tmpUnzippedSegmentDir);
final QueryableIndex queryableIndex = DruidStorageHandlerUtils.INDEX_IO.loadIndex(tmpUnzippedSegmentDir);
QueryableIndexStorageAdapter adapter = new QueryableIndexStorageAdapter(queryableIndex);
Firehose firehose = new IngestSegmentFirehose(ImmutableList.of(new WindowedStorageAdapter(adapter, adapter.getInterval())), ImmutableList.of("host"), ImmutableList.of("visited_sum", "unique_hosts"), null, QueryGranularities.NONE);
List<InputRow> rows = Lists.newArrayList();
while (firehose.hasMore()) {
rows.add(firehose.nextRow());
}
verifyRows(expectedRows, rows);
}
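
The parserMap handed to DataSchema above is just the Jackson Map form of the MapInputRowParser; DataSchema converts it back into a parser on demand. Below is a minimal sketch of that round trip, assuming Druid's DefaultObjectMapper and the io.druid.data.input.impl classes used in the test; the class name and column names are illustrative, not taken from the project.

import java.util.Map;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.ImmutableList;

import io.druid.data.input.impl.DimensionSchema;
import io.druid.data.input.impl.DimensionsSpec;
import io.druid.data.input.impl.InputRowParser;
import io.druid.data.input.impl.MapInputRowParser;
import io.druid.data.input.impl.StringDimensionSchema;
import io.druid.data.input.impl.TimeAndDimsParseSpec;
import io.druid.data.input.impl.TimestampSpec;
import io.druid.jackson.DefaultObjectMapper;

public class ParserMapRoundTripSketch {
  public static void main(String[] args) {
    final ObjectMapper objectMapper = new DefaultObjectMapper();
    // Same shape as the parser in testWrite: a timestamp column plus one string dimension.
    final InputRowParser inputRowParser = new MapInputRowParser(
        new TimeAndDimsParseSpec(
            new TimestampSpec("timestamp", "auto", null),
            new DimensionsSpec(ImmutableList.<DimensionSchema>of(new StringDimensionSchema("host")), null, null)));
    // Serialize the parser to a Map, as testWrite does before handing it to DataSchema ...
    final Map<String, Object> parserMap = objectMapper.convertValue(inputRowParser, Map.class);
    // ... and convert it back, which is what DataSchema resolves internally when asked for its parser.
    final InputRowParser restored = objectMapper.convertValue(parserMap, InputRowParser.class);
    // Should print "timestamp": the parse spec survives the Map round trip.
    System.out.println(restored.getParseSpec().getTimestampSpec().getTimestampColumn());
  }
}
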
use of io.druid.data.input.impl.InputRowParser in project druid by druid-io.
the class DataSchema method getParser.
@JsonIgnore
public InputRowParser getParser() {
if (parser == null) {
log.warn("No parser has been specified");
return null;
}
final InputRowParser inputRowParser = jsonMapper.convertValue(this.parser, InputRowParser.class);
final Set<String> dimensionExclusions = Sets.newHashSet();
for (AggregatorFactory aggregator : aggregators) {
dimensionExclusions.addAll(aggregator.requiredFields());
dimensionExclusions.add(aggregator.getName());
}
if (inputRowParser.getParseSpec() != null) {
final DimensionsSpec dimensionsSpec = inputRowParser.getParseSpec().getDimensionsSpec();
final TimestampSpec timestampSpec = inputRowParser.getParseSpec().getTimestampSpec();
// exclude timestamp from dimensions by default, unless explicitly included in the list of dimensions
if (timestampSpec != null) {
final String timestampColumn = timestampSpec.getTimestampColumn();
if (!(dimensionsSpec.hasCustomDimensions() && dimensionsSpec.getDimensionNames().contains(timestampColumn))) {
dimensionExclusions.add(timestampColumn);
}
}
if (dimensionsSpec != null) {
final Set<String> metSet = Sets.newHashSet();
for (AggregatorFactory aggregator : aggregators) {
metSet.add(aggregator.getName());
}
final Set<String> dimSet = Sets.newHashSet(dimensionsSpec.getDimensionNames());
final Set<String> overlap = Sets.intersection(metSet, dimSet);
if (!overlap.isEmpty()) {
throw new IAE("Cannot have overlapping dimensions and metrics of the same name. Please change the name of the metric. Overlap: %s", overlap);
}
return inputRowParser.withParseSpec(inputRowParser.getParseSpec().withDimensionsSpec(dimensionsSpec.withDimensionExclusions(Sets.difference(dimensionExclusions, dimSet))));
} else {
return inputRowParser;
}
} else {
log.warn("No parseSpec in parser has been specified.");
return inputRowParser;
}
}
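
Seen from a caller, the practical effect of getParser() is that the timestamp column and every aggregator name or required field end up in the parse spec's dimension exclusions, so they are never also ingested as dimensions. A minimal usage sketch follows, reusing the parserMap, objectMapper, aggregators and granularity spec built exactly as in the Hive testWrite snippet above; getDimensionExclusions() is assumed here to be the DimensionsSpec accessor for that list.

// Assumes parserMap, objectMapper, INTERVAL_FULL and the imports from testWrite above.
DataSchema dataSchema = new DataSchema(
    "testDataSource",
    parserMap,
    new AggregatorFactory[] {
        new LongSumAggregatorFactory("visited_sum", "visited_sum"),
        new HyperUniquesAggregatorFactory("unique_hosts", "unique_hosts")
    },
    new UniformGranularitySpec(Granularity.DAY, QueryGranularities.NONE, ImmutableList.of(INTERVAL_FULL)),
    objectMapper);
InputRowParser effectiveParser = dataSchema.getParser();
// Expect the timestamp column, "visited_sum" and "unique_hosts" among the exclusions,
// while the explicitly declared "host" dimension is left alone.
System.out.println(effectiveParser.getParseSpec().getDimensionsSpec().getDimensionExclusions());
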
use of io.druid.data.input.impl.InputRowParser in project druid by druid-io.
the class RealtimeManagerTest method setUp.
@Before
public void setUp() throws Exception {
final List<TestInputRowHolder> rows = Arrays.asList(makeRow(new DateTime("9000-01-01").getMillis()), makeRow(new ParseException("parse error")), null, makeRow(new DateTime().getMillis()));
ObjectMapper jsonMapper = new DefaultObjectMapper();
schema = new DataSchema("test", null, new AggregatorFactory[] { new CountAggregatorFactory("rows") }, new UniformGranularitySpec(Granularities.HOUR, Granularities.NONE, null), jsonMapper);
schema2 = new DataSchema("testV2", null, new AggregatorFactory[] { new CountAggregatorFactory("rows") }, new UniformGranularitySpec(Granularities.HOUR, Granularities.NONE, null), jsonMapper);
RealtimeIOConfig ioConfig = new RealtimeIOConfig(new FirehoseFactory() {
@Override
public Firehose connect(InputRowParser parser) throws IOException {
return new TestFirehose(rows.iterator());
}
}, new PlumberSchool() {
@Override
public Plumber findPlumber(DataSchema schema, RealtimeTuningConfig config, FireDepartmentMetrics metrics) {
return plumber;
}
}, null);
RealtimeIOConfig ioConfig2 = new RealtimeIOConfig(null, new PlumberSchool() {
@Override
public Plumber findPlumber(DataSchema schema, RealtimeTuningConfig config, FireDepartmentMetrics metrics) {
return plumber2;
}
}, new FirehoseFactoryV2() {
@Override
public FirehoseV2 connect(InputRowParser parser, Object arg1) throws IOException, ParseException {
return new TestFirehoseV2(rows.iterator());
}
});
RealtimeTuningConfig tuningConfig = new RealtimeTuningConfig(1, new Period("P1Y"), null, null, null, null, null, null, null, null, 0, 0, null, null);
plumber = new TestPlumber(new Sink(new Interval("0/P5000Y"), schema, tuningConfig.getShardSpec(), new DateTime().toString(), tuningConfig.getMaxRowsInMemory(), tuningConfig.isReportParseExceptions()));
realtimeManager = new RealtimeManager(Arrays.<FireDepartment>asList(new FireDepartment(schema, ioConfig, tuningConfig)), null);
plumber2 = new TestPlumber(new Sink(new Interval("0/P5000Y"), schema2, tuningConfig.getShardSpec(), new DateTime().toString(), tuningConfig.getMaxRowsInMemory(), tuningConfig.isReportParseExceptions()));
realtimeManager2 = new RealtimeManager(Arrays.<FireDepartment>asList(new FireDepartment(schema2, ioConfig2, tuningConfig)), null);
tuningConfig_0 = new RealtimeTuningConfig(1, new Period("P1Y"), null, null, null, null, null, new LinearShardSpec(0), null, null, 0, 0, null, null);
tuningConfig_1 = new RealtimeTuningConfig(1, new Period("P1Y"), null, null, null, null, null, new LinearShardSpec(1), null, null, 0, 0, null, null);
schema3 = new DataSchema("testing", null, new AggregatorFactory[] { new CountAggregatorFactory("ignore") }, new UniformGranularitySpec(Granularities.HOUR, Granularities.NONE, null), jsonMapper);
FireDepartment department_0 = new FireDepartment(schema3, ioConfig, tuningConfig_0);
FireDepartment department_1 = new FireDepartment(schema3, ioConfig2, tuningConfig_1);
QueryRunnerFactoryConglomerate conglomerate = new QueryRunnerFactoryConglomerate() {
@Override
public <T, QueryType extends Query<T>> QueryRunnerFactory<T, QueryType> findFactory(QueryType query) {
return factory;
}
};
chiefStartedLatch = new CountDownLatch(2);
RealtimeManager.FireChief fireChief_0 = new RealtimeManager.FireChief(department_0, conglomerate) {
@Override
public void run() {
super.initPlumber();
chiefStartedLatch.countDown();
}
};
RealtimeManager.FireChief fireChief_1 = new RealtimeManager.FireChief(department_1, conglomerate) {
@Override
public void run() {
super.initPlumber();
chiefStartedLatch.countDown();
}
};
realtimeManager3 = new RealtimeManager(Arrays.asList(department_0, department_1), conglomerate, ImmutableMap.<String, Map<Integer, RealtimeManager.FireChief>>of("testing", ImmutableMap.of(0, fireChief_0, 1, fireChief_1)));
startFireChiefWithPartitionNum(fireChief_0, 0);
startFireChiefWithPartitionNum(fireChief_1, 1);
}
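
The anonymous FirehoseFactory above ignores its parser argument because TestFirehose fabricates InputRows itself; a production factory would normally push each raw record through the supplied InputRowParser. Below is a minimal sketch of such a factory over an in-memory list of maps, assuming the single-argument connect(InputRowParser) shown above and a Firehose contract of hasMore()/nextRow()/commit()/close(); the class name and the parse call are illustrative.

import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import io.druid.data.input.Firehose;
import io.druid.data.input.FirehoseFactory;
import io.druid.data.input.InputRow;
import io.druid.data.input.impl.InputRowParser;

public class InMemoryMapFirehoseFactory implements FirehoseFactory {
  private final List<Map<String, Object>> rawRows;

  public InMemoryMapFirehoseFactory(List<Map<String, Object>> rawRows) {
    this.rawRows = rawRows;
  }

  @Override
  public Firehose connect(final InputRowParser parser) throws IOException {
    final Iterator<Map<String, Object>> it = rawRows.iterator();
    return new Firehose() {
      @Override
      public boolean hasMore() {
        return it.hasNext();
      }

      @Override
      @SuppressWarnings("unchecked")
      public InputRow nextRow() {
        // Timestamp and dimension extraction are delegated to the injected parser
        // (e.g. the MapInputRowParser configured through the DataSchema).
        return parser.parse(it.next());
      }

      @Override
      public Runnable commit() {
        // Nothing to checkpoint for an in-memory source.
        return new Runnable() {
          @Override
          public void run() {
          }
        };
      }

      @Override
      public void close() throws IOException {
        // No resources to release.
      }
    };
  }
}

A factory like this could be dropped into the RealtimeIOConfig above in place of the anonymous one while still honouring whatever parser the DataSchema declares.
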
use of io.druid.data.input.impl.InputRowParser in project druid by druid-io.
the class HadoopyStringInputRowParserTest method testSerde.
@Test
public void testSerde() throws Exception {
String jsonStr = "{" + "\"type\":\"hadoopyString\"," + "\"parseSpec\":{\"format\":\"json\",\"timestampSpec\":{\"column\":\"xXx\"},\"dimensionsSpec\":{}}" + "}";
ObjectMapper jsonMapper = HadoopDruidIndexerConfig.JSON_MAPPER;
InputRowParser parser = jsonMapper.readValue(jsonMapper.writeValueAsString(jsonMapper.readValue(jsonStr, InputRowParser.class)), InputRowParser.class);
Assert.assertTrue(parser instanceof HadoopyStringInputRowParser);
Assert.assertEquals("xXx", parser.getParseSpec().getTimestampSpec().getTimestampColumn());
}
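
The hadoopyString subtype is resolved through HadoopDruidIndexerConfig.JSON_MAPPER in the test above; a similar round trip works for the parser types registered directly on InputRowParser. The sketch below uses the core "string" type with Druid's DefaultObjectMapper and reuses the test's parseSpec JSON; this variant is an assumption for illustration, not code from the project.

import com.fasterxml.jackson.databind.ObjectMapper;

import io.druid.data.input.impl.InputRowParser;
import io.druid.jackson.DefaultObjectMapper;

public class StringParserSerdeSketch {
  public static void main(String[] args) throws Exception {
    // Same parseSpec JSON as in the test above; only the parser type differs.
    String jsonStr = "{"
        + "\"type\":\"string\","
        + "\"parseSpec\":{\"format\":\"json\",\"timestampSpec\":{\"column\":\"xXx\"},\"dimensionsSpec\":{}}"
        + "}";
    ObjectMapper jsonMapper = new DefaultObjectMapper();
    InputRowParser parser = jsonMapper.readValue(jsonStr, InputRowParser.class);
    // The timestamp column survives the serde round trip just as in the hadoopyString case.
    System.out.println(parser.getParseSpec().getTimestampSpec().getTimestampColumn());
  }
}
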
use of io.druid.data.input.impl.InputRowParser in project hive by apache.
the class DruidOutputFormat method getHiveRecordWriter.
@Override
public FileSinkOperator.RecordWriter getHiveRecordWriter(JobConf jc, Path finalOutPath, Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties, Progressable progress) throws IOException {
final String segmentGranularity = tableProperties.getProperty(Constants.DRUID_SEGMENT_GRANULARITY) != null ? tableProperties.getProperty(Constants.DRUID_SEGMENT_GRANULARITY) : HiveConf.getVar(jc, HiveConf.ConfVars.HIVE_DRUID_INDEXING_GRANULARITY);
final String dataSource = tableProperties.getProperty(Constants.DRUID_DATA_SOURCE);
final String segmentDirectory = tableProperties.getProperty(Constants.DRUID_SEGMENT_DIRECTORY) != null ? tableProperties.getProperty(Constants.DRUID_SEGMENT_DIRECTORY) : HiveConf.getVar(jc, HiveConf.ConfVars.DRUID_SEGMENT_DIRECTORY);
final HdfsDataSegmentPusherConfig hdfsDataSegmentPusherConfig = new HdfsDataSegmentPusherConfig();
hdfsDataSegmentPusherConfig.setStorageDirectory(segmentDirectory);
final DataSegmentPusher hdfsDataSegmentPusher = new HdfsDataSegmentPusher(hdfsDataSegmentPusherConfig, jc, DruidStorageHandlerUtils.JSON_MAPPER);
final GranularitySpec granularitySpec = new UniformGranularitySpec(Granularity.valueOf(segmentGranularity), QueryGranularity.fromString(tableProperties.getProperty(Constants.DRUID_QUERY_GRANULARITY) == null ? "NONE" : tableProperties.getProperty(Constants.DRUID_QUERY_GRANULARITY)), null);
final String columnNameProperty = tableProperties.getProperty(serdeConstants.LIST_COLUMNS);
final String columnTypeProperty = tableProperties.getProperty(serdeConstants.LIST_COLUMN_TYPES);
if (StringUtils.isEmpty(columnNameProperty) || StringUtils.isEmpty(columnTypeProperty)) {
throw new IllegalStateException(String.format("List of column names [%s] or column types [%s] is not present", columnNameProperty, columnTypeProperty));
}
ArrayList<String> columnNames = new ArrayList<String>();
for (String name : columnNameProperty.split(",")) {
columnNames.add(name);
}
if (!columnNames.contains(DruidTable.DEFAULT_TIMESTAMP_COLUMN)) {
throw new IllegalStateException("Timestamp column (' " + DruidTable.DEFAULT_TIMESTAMP_COLUMN + "') not specified in create table; list of columns is : " + tableProperties.getProperty(serdeConstants.LIST_COLUMNS));
}
ArrayList<TypeInfo> columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
// By default, all columns that are not metrics or the timestamp are treated as dimensions
final List<DimensionSchema> dimensions = new ArrayList<>();
ImmutableList.Builder<AggregatorFactory> aggregatorFactoryBuilder = ImmutableList.builder();
for (int i = 0; i < columnTypes.size(); i++) {
PrimitiveTypeInfo f = (PrimitiveTypeInfo) columnTypes.get(i);
AggregatorFactory af;
switch(f.getPrimitiveCategory()) {
case BYTE:
case SHORT:
case INT:
case LONG:
af = new LongSumAggregatorFactory(columnNames.get(i), columnNames.get(i));
break;
case FLOAT:
case DOUBLE:
case DECIMAL:
af = new DoubleSumAggregatorFactory(columnNames.get(i), columnNames.get(i));
break;
case TIMESTAMP:
String tColumnName = columnNames.get(i);
if (!tColumnName.equals(DruidTable.DEFAULT_TIMESTAMP_COLUMN) && !tColumnName.equals(Constants.DRUID_TIMESTAMP_GRANULARITY_COL_NAME)) {
throw new IOException("Dimension " + tColumnName + " does not have STRING type: " + f.getPrimitiveCategory());
}
continue;
default:
// Dimension
String dColumnName = columnNames.get(i);
if (PrimitiveObjectInspectorUtils.getPrimitiveGrouping(f.getPrimitiveCategory()) != PrimitiveGrouping.STRING_GROUP) {
throw new IOException("Dimension " + dColumnName + " does not have STRING type: " + f.getPrimitiveCategory());
}
dimensions.add(new StringDimensionSchema(dColumnName));
continue;
}
aggregatorFactoryBuilder.add(af);
}
List<AggregatorFactory> aggregatorFactories = aggregatorFactoryBuilder.build();
final InputRowParser inputRowParser = new MapInputRowParser(new TimeAndDimsParseSpec(new TimestampSpec(DruidTable.DEFAULT_TIMESTAMP_COLUMN, "auto", null), new DimensionsSpec(dimensions, Lists.newArrayList(Constants.DRUID_TIMESTAMP_GRANULARITY_COL_NAME), null)));
Map<String, Object> inputParser = DruidStorageHandlerUtils.JSON_MAPPER.convertValue(inputRowParser, Map.class);
final DataSchema dataSchema = new DataSchema(Preconditions.checkNotNull(dataSource, "Data source name is null"), inputParser, aggregatorFactories.toArray(new AggregatorFactory[aggregatorFactories.size()]), granularitySpec, DruidStorageHandlerUtils.JSON_MAPPER);
final String workingPath = jc.get(Constants.DRUID_JOB_WORKING_DIRECTORY);
final String version = jc.get(Constants.DRUID_SEGMENT_VERSION);
Integer maxPartitionSize = HiveConf.getIntVar(jc, HiveConf.ConfVars.HIVE_DRUID_MAX_PARTITION_SIZE);
String basePersistDirectory = HiveConf.getVar(jc, HiveConf.ConfVars.HIVE_DRUID_BASE_PERSIST_DIRECTORY);
if (Strings.isNullOrEmpty(basePersistDirectory)) {
basePersistDirectory = System.getProperty("java.io.tmpdir");
}
Integer maxRowInMemory = HiveConf.getIntVar(jc, HiveConf.ConfVars.HIVE_DRUID_MAX_ROW_IN_MEMORY);
RealtimeTuningConfig realtimeTuningConfig = new RealtimeTuningConfig(maxRowInMemory, null, null, new File(basePersistDirectory, dataSource), new CustomVersioningPolicy(version), null, null, null, null, true, 0, 0, true, null);
LOG.debug(String.format("running with Data schema [%s] ", dataSchema));
return new DruidRecordWriter(dataSchema, realtimeTuningConfig, hdfsDataSegmentPusher, maxPartitionSize, new Path(workingPath, SEGMENTS_DESCRIPTOR_DIR_NAME), finalOutPath.getFileSystem(jc));
}
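
At write time, the MapInputRowParser assembled here is what turns each Hive row, presented as a Map keyed by column name, into a Druid InputRow. Below is a minimal sketch of that single step, assuming the one-argument parse(Map) method on MapInputRowParser; the column names and values are hypothetical placeholders rather than anything read from the table properties.

import java.util.Map;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;

import io.druid.data.input.InputRow;
import io.druid.data.input.impl.DimensionSchema;
import io.druid.data.input.impl.DimensionsSpec;
import io.druid.data.input.impl.MapInputRowParser;
import io.druid.data.input.impl.StringDimensionSchema;
import io.druid.data.input.impl.TimeAndDimsParseSpec;
import io.druid.data.input.impl.TimestampSpec;

public class HiveRowParseSketch {
  public static void main(String[] args) {
    // Parser shaped like the one built in getHiveRecordWriter, with hypothetical column names.
    MapInputRowParser rowParser = new MapInputRowParser(
        new TimeAndDimsParseSpec(
            new TimestampSpec("timestamp", "auto", null),
            new DimensionsSpec(ImmutableList.<DimensionSchema>of(new StringDimensionSchema("host")), null, null)));
    // One Hive row as the record writer would see it: timestamp, a dimension and a metric column.
    Map<String, Object> hiveRow = ImmutableMap.<String, Object>of(
        "timestamp", "2014-10-22T00:00:00.000Z",
        "host", "a.example.com",
        "visited", 190L);
    InputRow row = rowParser.parse(hiveRow);
    System.out.println(row.getTimestampFromEpoch()); // epoch millis of the parsed timestamp
    System.out.println(row.getDimension("host"));    // [a.example.com]
  }
}
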