use of io.druid.timeline.DataSegment in project druid by druid-io.
the class DataSegmentPusherUtilTest method shouldNotHaveColonsInHdfsStorageDir.
@Test
public void shouldNotHaveColonsInHdfsStorageDir() throws Exception {
  Interval interval = new Interval("2011-10-01/2011-10-02");
  ImmutableMap<String, Object> loadSpec = ImmutableMap.<String, Object>of("something", "or_other");
  DataSegment segment = new DataSegment(
      "something", interval, "brand:new:version", loadSpec,
      Arrays.asList("dim1", "dim2"), Arrays.asList("met1", "met2"),
      NoneShardSpec.instance(), null, 1
  );
  String storageDir = DataSegmentPusherUtil.getHdfsStorageDir(segment);
  Assert.assertEquals("something/20111001T000000.000Z_20111002T000000.000Z/brand_new_version", storageDir);
}
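getHdfsStorageDir itself is not shown on this page; the following is a minimal sketch of logic that would satisfy the assertion above. The Joda basicDateTime formatting and the class name HdfsStorageDirSketch are assumptions for illustration, not the actual Druid implementation.
// Illustrative sketch only: reproduces the layout asserted in the test above; not the Druid source.
import io.druid.timeline.DataSegment;
import org.joda.time.format.ISODateTimeFormat;

public class HdfsStorageDirSketch {
  public static String getHdfsStorageDir(DataSegment segment) {
    // HDFS paths may not contain ':', so the interval is rendered in the colon-free "basic" ISO
    // format and any ':' in the version string is replaced with '_'.
    return String.join(
        "/",
        segment.getDataSource(),
        segment.getInterval().getStart().toString(ISODateTimeFormat.basicDateTime())
            + "_"
            + segment.getInterval().getEnd().toString(ISODateTimeFormat.basicDateTime()),
        segment.getVersion().replaceAll(":", "_")
    );
  }
}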
use of io.druid.timeline.DataSegment in project druid by druid-io.
the class HdfsDataSegmentFinderTest method testFindSegments.
@Test
public void testFindSegments() throws Exception {
  final HdfsDataSegmentFinder hdfsDataSegmentFinder = new HdfsDataSegmentFinder(conf, mapper);
  final Set<DataSegment> segments = hdfsDataSegmentFinder.findSegments(dataSourceDir.toString(), false);
  Assert.assertEquals(6, segments.size());
  DataSegment updatedSegment1 = null;
  DataSegment updatedSegment2 = null;
  DataSegment updatedSegment3 = null;
  DataSegment updatedSegment4_0 = null;
  DataSegment updatedSegment4_1 = null;
  DataSegment updatedSegment5 = null;
  for (DataSegment dataSegment : segments) {
    if (dataSegment.getIdentifier().equals(SEGMENT_1.getIdentifier())) {
      updatedSegment1 = dataSegment;
    } else if (dataSegment.getIdentifier().equals(SEGMENT_2.getIdentifier())) {
      updatedSegment2 = dataSegment;
    } else if (dataSegment.getIdentifier().equals(SEGMENT_3.getIdentifier())) {
      updatedSegment3 = dataSegment;
    } else if (dataSegment.getIdentifier().equals(SEGMENT_4_0.getIdentifier())) {
      updatedSegment4_0 = dataSegment;
    } else if (dataSegment.getIdentifier().equals(SEGMENT_4_1.getIdentifier())) {
      updatedSegment4_1 = dataSegment;
    } else if (dataSegment.getIdentifier().equals(SEGMENT_5.getIdentifier())) {
      updatedSegment5 = dataSegment;
    } else {
      Assert.fail("Unexpected segment");
    }
  }
  Assert.assertEquals(descriptor1.toUri().getPath(), getDescriptorPath(updatedSegment1));
  Assert.assertEquals(descriptor2.toUri().getPath(), getDescriptorPath(updatedSegment2));
  Assert.assertEquals(descriptor3.toUri().getPath(), getDescriptorPath(updatedSegment3));
  Assert.assertEquals(descriptor4_0.toUri().getPath(), getDescriptorPath(updatedSegment4_0));
  Assert.assertEquals(descriptor4_1.toUri().getPath(), getDescriptorPath(updatedSegment4_1));
  Assert.assertEquals(descriptor5.toUri().getPath(), getDescriptorPathWithPartitionNum(updatedSegment5, 1));
  final String serializedSegment1 = mapper.writeValueAsString(updatedSegment1);
  final String serializedSegment2 = mapper.writeValueAsString(updatedSegment2);
  final String serializedSegment3 = mapper.writeValueAsString(updatedSegment3);
  final String serializedSegment4_0 = mapper.writeValueAsString(updatedSegment4_0);
  final String serializedSegment4_1 = mapper.writeValueAsString(updatedSegment4_1);
  final String serializedSegment5 = mapper.writeValueAsString(updatedSegment5);
  // since updateDescriptor was not enabled, descriptor.json still has stale information
  Assert.assertNotEquals(serializedSegment1, readContent(descriptor1));
  Assert.assertNotEquals(serializedSegment2, readContent(descriptor2));
  Assert.assertNotEquals(serializedSegment3, readContent(descriptor3));
  Assert.assertNotEquals(serializedSegment4_0, readContent(descriptor4_0));
  Assert.assertNotEquals(serializedSegment4_1, readContent(descriptor4_1));
  Assert.assertNotEquals(serializedSegment5, readContent(descriptor5));
  // enable updateDescriptor so that descriptor.json will be updated to reflect the new loadSpec
  final Set<DataSegment> segments2 = hdfsDataSegmentFinder.findSegments(dataSourceDir.toString(), true);
  Assert.assertEquals(segments, segments2);
  Assert.assertEquals(serializedSegment1, readContent(descriptor1));
  Assert.assertEquals(serializedSegment2, readContent(descriptor2));
  Assert.assertEquals(serializedSegment3, readContent(descriptor3));
  Assert.assertEquals(serializedSegment4_0, readContent(descriptor4_0));
  Assert.assertEquals(serializedSegment4_1, readContent(descriptor4_1));
  Assert.assertEquals(serializedSegment5, readContent(descriptor5));
}
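The getDescriptorPath helpers used above are not shown on this page; a plausible sketch, assuming the descriptor sits next to the index.zip referenced by the segment's loadSpec (hypothetical bodies, matching the file naming handled by findSegments below):
// Hypothetical helper bodies, inferred from the descriptor/index.zip naming used elsewhere on this page.
private String getDescriptorPath(DataSegment segment) {
  final Path indexZip = new Path(String.valueOf(segment.getLoadSpec().get("path")));
  return new Path(indexZip.getParent(), "descriptor.json").toUri().getPath();
}

private String getDescriptorPathWithPartitionNum(DataSegment segment, int partitionNum) {
  final Path indexZip = new Path(String.valueOf(segment.getLoadSpec().get("path")));
  return new Path(indexZip.getParent(), partitionNum + "_descriptor.json").toUri().getPath();
}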
use of io.druid.timeline.DataSegment in project druid by druid-io.
the class HdfsDataSegmentFinder method findSegments.
@Override
public Set<DataSegment> findSegments(String workingDirPathStr, boolean updateDescriptor) throws SegmentLoadingException {
  final Set<DataSegment> segments = Sets.newHashSet();
  final Path workingDirPath = new Path(workingDirPathStr);
  FileSystem fs;
  try {
    fs = workingDirPath.getFileSystem(config);
    log.info(fs.getScheme());
    log.info("FileSystem URI:" + fs.getUri().toString());
    if (!fs.exists(workingDirPath)) {
      throw new SegmentLoadingException("Working directory [%s] doesn't exist.", workingDirPath);
    }
    if (!fs.isDirectory(workingDirPath)) {
      throw new SegmentLoadingException("Working directory [%s] is not a directory!?", workingDirPath);
    }
    // Recursively walk the working directory looking for descriptor.json files.
    final RemoteIterator<LocatedFileStatus> it = fs.listFiles(workingDirPath, true);
    while (it.hasNext()) {
      final LocatedFileStatus locatedFileStatus = it.next();
      final Path path = locatedFileStatus.getPath();
      if (path.getName().endsWith("descriptor.json")) {
        // A descriptor named <partitionNum>_descriptor.json pairs with <partitionNum>_index.zip;
        // a bare descriptor.json pairs with index.zip.
        final Path indexZip;
        final String[] descriptorParts = path.getName().split("_");
        if (descriptorParts.length == 2
            && descriptorParts[1].equals("descriptor.json")
            && StringUtils.isNumeric(descriptorParts[0])) {
          indexZip = new Path(path.getParent(), String.format("%s_index.zip", descriptorParts[0]));
        } else {
          indexZip = new Path(path.getParent(), "index.zip");
        }
        if (fs.exists(indexZip)) {
          final DataSegment dataSegment = mapper.readValue(fs.open(path), DataSegment.class);
          log.info("Found segment [%s] located at [%s]", dataSegment.getIdentifier(), indexZip);
          // Rewrite the loadSpec if it no longer points at this index.zip.
          final Map<String, Object> loadSpec = dataSegment.getLoadSpec();
          final String pathWithoutScheme = indexZip.toUri().getPath();
          if (!loadSpec.get("type").equals(HdfsStorageDruidModule.SCHEME)
              || !loadSpec.get("path").equals(pathWithoutScheme)) {
            loadSpec.put("type", HdfsStorageDruidModule.SCHEME);
            loadSpec.put("path", pathWithoutScheme);
            if (updateDescriptor) {
              log.info("Updating loadSpec in descriptor.json at [%s] with new path [%s]", path, pathWithoutScheme);
              mapper.writeValue(fs.create(path, true), dataSegment);
            }
          }
          segments.add(dataSegment);
        } else {
throw new SegmentLoadingException("index.zip didn't exist at [%s] while descripter.json exists!?", indexZip);
        }
      }
    }
  } catch (IOException e) {
    throw new SegmentLoadingException(e, "Problems interacting with filesystem[%s].", workingDirPath);
  }
  return segments;
}
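A usage sketch for the method above; the HDFS URI below is made up for the example, and note that findSegments throws the checked SegmentLoadingException.
// Illustrative usage of HdfsDataSegmentFinder; the path is not a real deployment path.
Configuration conf = new Configuration();
ObjectMapper mapper = new DefaultObjectMapper();
HdfsDataSegmentFinder finder = new HdfsDataSegmentFinder(conf, mapper);
// Pass true to rewrite stale loadSpecs in the descriptor.json files while scanning.
Set<DataSegment> segments = finder.findSegments("hdfs://namenode:8020/druid/segments/wikipedia", true);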
use of io.druid.timeline.DataSegment in project druid by druid-io.
the class HdfsDataSegmentPusher method push.
@Override
public DataSegment push(File inDir, DataSegment segment) throws IOException {
  final String storageDir = DataSegmentPusherUtil.getHdfsStorageDir(segment);
  log.info("Copying segment[%s] to HDFS at location[%s/%s]", segment.getIdentifier(), fullyQualifiedStorageDirectory, storageDir);
  Path tmpIndexFile = new Path(String.format(
      "%s/%s/%s/%s_index.zip",
      fullyQualifiedStorageDirectory,
      segment.getDataSource(),
      UUIDUtils.generateUuid(),
      segment.getShardSpec().getPartitionNum()
  ));
  FileSystem fs = tmpIndexFile.getFileSystem(hadoopConfig);
  fs.mkdirs(tmpIndexFile.getParent());
  log.info("Compressing files from[%s] to [%s]", inDir, tmpIndexFile);
  final long size;
  final DataSegment dataSegment;
  try (FSDataOutputStream out = fs.create(tmpIndexFile)) {
    size = CompressionUtils.zip(inDir, out);
    final Path outIndexFile = new Path(String.format(
        "%s/%s/%d_index.zip",
        fullyQualifiedStorageDirectory,
        storageDir,
        segment.getShardSpec().getPartitionNum()
    ));
    final Path outDescriptorFile = new Path(String.format(
        "%s/%s/%d_descriptor.json",
        fullyQualifiedStorageDirectory,
        storageDir,
        segment.getShardSpec().getPartitionNum()
    ));
    dataSegment = segment.withLoadSpec(makeLoadSpec(outIndexFile))
                         .withSize(size)
                         .withBinaryVersion(SegmentUtils.getVersionFromDir(inDir));
    final Path tmpDescriptorFile = new Path(
        tmpIndexFile.getParent(),
        String.format("%s_descriptor.json", dataSegment.getShardSpec().getPartitionNum())
    );
    log.info("Creating descriptor file at[%s]", tmpDescriptorFile);
    ByteSource.wrap(jsonMapper.writeValueAsBytes(dataSegment)).copyTo(new HdfsOutputStreamSupplier(fs, tmpDescriptorFile));
    // Create parent if it does not exist, recreation is not an error
    fs.mkdirs(outIndexFile.getParent());
    copyFilesWithChecks(fs, tmpDescriptorFile, outDescriptorFile);
    copyFilesWithChecks(fs, tmpIndexFile, outIndexFile);
  } finally {
    try {
      if (fs.exists(tmpIndexFile.getParent()) && !fs.delete(tmpIndexFile.getParent(), true)) {
        log.error("Failed to delete temp directory[%s]", tmpIndexFile.getParent());
      }
    } catch (IOException ex) {
      log.error(ex, "Failed to delete temp directory[%s]", tmpIndexFile.getParent());
    }
  }
  return dataSegment;
}
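makeLoadSpec is not shown here; given the loadSpec keys read back by findSegments and asserted in HdfsDataSegmentPusherTest below, it presumably resembles this sketch (an assumption, not the verified Druid source):
// Assumed shape of makeLoadSpec, inferred from the {"type": "hdfs", "path": ...} loadSpec seen elsewhere on this page.
private ImmutableMap<String, Object> makeLoadSpec(Path indexFile) {
  return ImmutableMap.<String, Object>of("type", "hdfs", "path", indexFile.toUri().toString());
}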
use of io.druid.timeline.DataSegment in project druid by druid-io.
the class HdfsDataSegmentPusherTest method testUsingScheme.
private void testUsingScheme(final String scheme) throws Exception {
  Configuration conf = new Configuration(true);
  // Create a mock segment on disk
  File segmentDir = tempFolder.newFolder();
  File tmp = new File(segmentDir, "version.bin");
  final byte[] data = new byte[] { 0x0, 0x0, 0x0, 0x1 };
  Files.write(data, tmp);
  final long size = data.length;
  HdfsDataSegmentPusherConfig config = new HdfsDataSegmentPusherConfig();
  final File storageDirectory = tempFolder.newFolder();
  config.setStorageDirectory(
      scheme != null
          ? String.format("%s://%s", scheme, storageDirectory.getAbsolutePath())
          : storageDirectory.getAbsolutePath()
  );
  HdfsDataSegmentPusher pusher = new HdfsDataSegmentPusher(config, conf, new DefaultObjectMapper());
  DataSegment segmentToPush = new DataSegment(
      "foo",
      new Interval("2015/2016"),
      "0",
      Maps.<String, Object>newHashMap(),
      Lists.<String>newArrayList(),
      Lists.<String>newArrayList(),
      NoneShardSpec.instance(),
      0,
      size
  );
  DataSegment segment = pusher.push(segmentDir, segmentToPush);
  String indexUri = String.format(
      "%s/%s/%d_index.zip",
      FileSystem.newInstance(conf).makeQualified(new Path(config.getStorageDirectory())).toUri().toString(),
      DataSegmentPusherUtil.getHdfsStorageDir(segmentToPush),
      segmentToPush.getShardSpec().getPartitionNum()
  );
  Assert.assertEquals(segmentToPush.getSize(), segment.getSize());
  Assert.assertEquals(segmentToPush, segment);
  Assert.assertEquals(ImmutableMap.of("type", "hdfs", "path", indexUri), segment.getLoadSpec());
  // rename directory after push
  final String segmentPath = DataSegmentPusherUtil.getHdfsStorageDir(segment);
  File indexFile = new File(String.format("%s/%s/%d_index.zip", storageDirectory, segmentPath, segment.getShardSpec().getPartitionNum()));
  Assert.assertTrue(indexFile.exists());
  File descriptorFile = new File(String.format("%s/%s/%d_descriptor.json", storageDirectory, segmentPath, segment.getShardSpec().getPartitionNum()));
  Assert.assertTrue(descriptorFile.exists());
  // push twice will fail and temp dir cleaned
  File outDir = new File(String.format("%s/%s", config.getStorageDirectory(), segmentPath));
  outDir.setReadOnly();
  try {
    pusher.push(segmentDir, segmentToPush);
  } catch (IOException e) {
    Assert.fail("should not throw exception");
  }
}
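testUsingScheme is a private helper; it is presumably invoked from @Test methods along these lines (the method names here are illustrative, not the actual test names):
@Test
public void testPushWithScheme() throws Exception {
  // Hypothetical driver: exercises the branch that prepends "file://" to the storage directory.
  testUsingScheme("file");
}

@Test
public void testPushWithoutScheme() throws Exception {
  // Hypothetical driver: exercises the null-scheme branch (plain absolute path).
  testUsingScheme(null);
}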