Use of org.apache.druid.segment.realtime.firehose.WindowedStorageAdapter in project druid by druid-io.
From the class IndexTaskTest, the method testNumShardsAndPartitionDimensionsProvided:
@Test
public void testNumShardsAndPartitionDimensionsProvided() throws Exception {
  final File tmpDir = temporaryFolder.newFolder();
  final File tmpFile = File.createTempFile("druid", "index", tmpDir);
  try (BufferedWriter writer = Files.newWriter(tmpFile, StandardCharsets.UTF_8)) {
    writer.write("2014-01-01T00:00:10Z,a,1\n");
    writer.write("2014-01-01T01:00:20Z,b,1\n");
    writer.write("2014-01-01T02:00:30Z,c,1\n");
  }

  // Hash-partition on "dim" into two shards.
  final IndexTask indexTask = new IndexTask(
      null,
      null,
      createDefaultIngestionSpec(
          jsonMapper,
          tmpDir,
          null,
          null,
          createTuningConfigWithPartitionsSpec(new HashedPartitionsSpec(null, 2, ImmutableList.of("dim")), true),
          false,
          false
      ),
      null
  );
  final List<DataSegment> segments = runTask(indexTask).rhs;
  Assert.assertEquals(2, segments.size());

  for (DataSegment segment : segments) {
    Assert.assertEquals(DATASOURCE, segment.getDataSource());
    Assert.assertEquals(Intervals.of("2014/P1D"), segment.getInterval());
    Assert.assertEquals(HashBasedNumberedShardSpec.class, segment.getShardSpec().getClass());
    final HashBasedNumberedShardSpec hashBasedNumberedShardSpec = (HashBasedNumberedShardSpec) segment.getShardSpec();
    Assert.assertEquals(HashPartitionFunction.MURMUR3_32_ABS, hashBasedNumberedShardSpec.getPartitionFunction());

    // Read the segment back through a WindowedStorageAdapter and check that the
    // recomputed "dim" group-key hashes agree.
    final File segmentFile = segmentCacheManager.getSegmentFiles(segment);
    final WindowedStorageAdapter adapter = new WindowedStorageAdapter(
        new QueryableIndexStorageAdapter(indexIO.loadIndex(segmentFile)),
        segment.getInterval()
    );
    final Sequence<Cursor> cursorSequence = adapter.getAdapter().makeCursors(
        null,
        segment.getInterval(),
        VirtualColumns.EMPTY,
        Granularities.ALL,
        false,
        null
    );
    final List<Integer> hashes = cursorSequence.map(cursor -> {
      final DimensionSelector selector =
          cursor.getColumnSelectorFactory().makeDimensionSelector(new DefaultDimensionSpec("dim", "dim"));
      final int hash = HashPartitionFunction.MURMUR3_32_ABS.hash(
          HashBasedNumberedShardSpec.serializeGroupKey(jsonMapper, Collections.singletonList(selector.getObject())),
          hashBasedNumberedShardSpec.getNumBuckets()
      );
      cursor.advance();
      return hash;
    }).toList();
    Assert.assertTrue(hashes.stream().allMatch(h -> h.intValue() == hashes.get(0)));
  }
}
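The read-back pattern in the loop above (wrap a loaded QueryableIndex in a WindowedStorageAdapter, make cursors over the window, and walk them) can be isolated into a small helper. The sketch below is illustrative only: the helper name countRowsInWindow is hypothetical and not part of IndexTaskTest, and it assumes only the Cursor and Sequence calls the test already uses.

// Hypothetical helper (not part of the Druid test): count the rows visible through a
// WindowedStorageAdapter restricted to the given window.
private static int countRowsInWindow(QueryableIndex index, Interval window) {
  final WindowedStorageAdapter windowed =
      new WindowedStorageAdapter(new QueryableIndexStorageAdapter(index), window);
  final Sequence<Cursor> cursors = windowed.getAdapter().makeCursors(
      null,                    // no filter
      windowed.getInterval(),  // read only rows inside the window
      VirtualColumns.EMPTY,
      Granularities.ALL,       // a single cursor spanning the whole window
      false,                   // ascending time order
      null                     // no query metrics
  );
  return cursors.map(cursor -> {
    int rows = 0;
    while (!cursor.isDone()) {
      rows++;
      cursor.advance();
    }
    return rows;
  }).accumulate(0, (total, perCursor) -> total + perCursor);
}

Called with the same interval used to build the WindowedStorageAdapter, it returns the number of rows the window exposes.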
Use of org.apache.druid.segment.realtime.firehose.WindowedStorageAdapter in project druid by druid-io.
From the class BatchDeltaIngestionTest, the method testIngestion:
private void testIngestion(
    HadoopDruidIndexerConfig config,
    List<ImmutableMap<String, Object>> expectedRowsGenerated,
    WindowedDataSegment windowedDataSegment,
    List<String> expectedDimensions,
    List<String> expectedMetrics
) throws Exception {
  // Run the index-generator job and publish its segments.
  IndexGeneratorJob job = new IndexGeneratorJob(config);
  Assert.assertTrue(JobHelper.runJobs(ImmutableList.of(job)));
  List<DataSegmentAndIndexZipFilePath> dataSegmentAndIndexZipFilePaths =
      IndexGeneratorJob.getPublishedSegmentAndIndexZipFilePaths(config);
  JobHelper.renameIndexFilesForSegments(config.getSchema(), dataSegmentAndIndexZipFilePaths);
  JobHelper.maybeDeleteIntermediatePath(true, config.getSchema());
  File workingPath = new File(config.makeIntermediatePath().toUri().getPath());
  Assert.assertFalse(workingPath.exists());

  // The published segment directory and its index.zip must exist.
  File segmentFolder = new File(
      StringUtils.format(
          "%s/%s/%s_%s/%s/0",
          config.getSchema().getIOConfig().getSegmentOutputPath(),
          config.getSchema().getDataSchema().getDataSource(),
          INTERVAL_FULL.getStart().toString(),
          INTERVAL_FULL.getEnd().toString(),
          config.getSchema().getTuningConfig().getVersion()
      )
  );
  Assert.assertTrue(segmentFolder.exists());
  File indexZip = new File(segmentFolder, "index.zip");
  Assert.assertTrue(indexZip.exists());

  // Unzip the published segment, load it, and read its rows back through an
  // IngestSegmentFirehose over a WindowedStorageAdapter.
  File tmpUnzippedSegmentDir = temporaryFolder.newFolder();
  new LocalDataSegmentPuller().getSegmentFiles(indexZip, tmpUnzippedSegmentDir);
  QueryableIndex index = INDEX_IO.loadIndex(tmpUnzippedSegmentDir);
  StorageAdapter adapter = new QueryableIndexStorageAdapter(index);
  Firehose firehose = new IngestSegmentFirehose(
      ImmutableList.of(new WindowedStorageAdapter(adapter, windowedDataSegment.getInterval())),
      TransformSpec.NONE,
      expectedDimensions,
      expectedMetrics,
      null
  );

  List<InputRow> rows = new ArrayList<>();
  while (firehose.hasMore()) {
    rows.add(firehose.nextRow());
  }
  verifyRows(expectedRowsGenerated, rows, expectedDimensions, expectedMetrics);
}
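The same firehose wiring can be expressed as a small helper for any already-loaded QueryableIndex. This is a minimal sketch assuming the IngestSegmentFirehose constructor used above; the helper name firehoseOverIndex and its parameter list are hypothetical.

// Hypothetical helper: re-read one loaded segment's rows through IngestSegmentFirehose.
private static IngestSegmentFirehose firehoseOverIndex(
    QueryableIndex index,
    Interval window,
    List<String> dimensions,
    List<String> metrics
) {
  final WindowedStorageAdapter windowed =
      new WindowedStorageAdapter(new QueryableIndexStorageAdapter(index), window);
  return new IngestSegmentFirehose(
      ImmutableList.of(windowed),  // one WindowedStorageAdapter per segment to re-read
      TransformSpec.NONE,          // no transform
      dimensions,                  // dimensions to expose on the re-read rows
      metrics,                     // metrics to expose on the re-read rows
      null                         // no filter
  );
}

Draining the returned firehose is the same hasMore()/nextRow() loop shown at the end of testIngestion.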
Use of org.apache.druid.segment.realtime.firehose.WindowedStorageAdapter in project druid by druid-io.
From the class DatasourceRecordReader, the method initialize:
@Override
public void initialize(InputSplit split, final TaskAttemptContext context) throws IOException {
  List<WindowedDataSegment> segments = ((DatasourceInputSplit) split).getSegments();
  String dataSource = Iterators.getOnlyElement(
      segments.stream().map(s -> s.getSegment().getDataSource()).distinct().iterator()
  );
  spec = DatasourceInputFormat.getIngestionSpec(context.getConfiguration(), dataSource);
  logger.info("load schema [%s]", spec);

  // Fetch each segment locally, load it, and wrap it in a WindowedStorageAdapter
  // restricted to the segment's interval. Lists.transform is lazy, so this work runs
  // when the firehose iterates the adapter list.
  List<WindowedStorageAdapter> adapters = Lists.transform(
      segments,
      new Function<WindowedDataSegment, WindowedStorageAdapter>() {
        @Override
        public WindowedStorageAdapter apply(WindowedDataSegment segment) {
          try {
            logger.info("Getting storage path for segment [%s]", segment.getSegment().getId());
            Path path = new Path(JobHelper.getURIFromSegment(segment.getSegment()));

            logger.info("Fetch segment files from [%s]", path);
            File dir = FileUtils.createTempDir();
            tmpSegmentDirs.add(dir);
            logger.info("Locally storing fetched segment at [%s]", dir);
            JobHelper.unzipNoGuava(path, context.getConfiguration(), dir, context, null);
            logger.info("finished fetching segment files");

            QueryableIndex index = HadoopDruidIndexerConfig.INDEX_IO.loadIndex(dir);
            indexes.add(index);
            numRows += index.getNumRows();

            return new WindowedStorageAdapter(new QueryableIndexStorageAdapter(index), segment.getInterval());
          }
          catch (IOException ex) {
            throw new RuntimeException(ex);
          }
        }
      }
  );

  firehose = new IngestSegmentFirehose(adapters, spec.getTransformSpec(), spec.getDimensions(), spec.getMetrics(), spec.getFilter());
}
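Because Lists.transform returns a lazy view, each segment above is fetched, unzipped, and loaded only when the firehose first touches that element of the adapter list. A rough eager equivalent is sketched below under the same assumptions (it reuses only the fields and calls already shown); since initialize declares throws IOException, the checked exception no longer needs to be wrapped in a RuntimeException as it does inside Function.apply.

// Eager sketch: fetch and open every segment up front instead of using the
// lazy Lists.transform view above.
final List<WindowedStorageAdapter> adapters = new ArrayList<>();
for (WindowedDataSegment segment : segments) {
  final Path path = new Path(JobHelper.getURIFromSegment(segment.getSegment()));
  final File dir = FileUtils.createTempDir();
  tmpSegmentDirs.add(dir);
  JobHelper.unzipNoGuava(path, context.getConfiguration(), dir, context, null);

  final QueryableIndex index = HadoopDruidIndexerConfig.INDEX_IO.loadIndex(dir);
  indexes.add(index);
  numRows += index.getNumRows();
  adapters.add(new WindowedStorageAdapter(new QueryableIndexStorageAdapter(index), segment.getInterval()));
}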