Search in sources:

Example 1 with FsWriterMetrics

use of org.apache.gobblin.writer.FsWriterMetrics in project incubator-gobblin by apache.

the class BaseDataPublisherTest method testWithFsMetricsNoPartitions.

/**
 * Publishes writer metrics for three work units that carry no partition key and verifies the
 * {@code metadata.json} written directly under the publisher's final directory.
 *
 * <p>The trailing arguments to {@code checkMetadata} (3, 95 and the three file entries) are
 * presumably the expected file count, total record count and per-file record counts
 * (10 + 30 + 55 = 95) — confirm against the helper.
 *
 * @throws IOException if the temporary publish directory cannot be cleaned up
 */
@Test
public void testWithFsMetricsNoPartitions() throws IOException {
    File publishPath = Files.createTempDir();
    try {
        final String finalDir = publishPath.getAbsolutePath();
        State jobState = buildDefaultState(1);
        String globalMetadataJson = new GlobalMetadata().toJson();

        // Job-level publisher configuration: no dedicated metadata dir, metadata goes to the final dir.
        jobState.removeProp(ConfigurationKeys.DATA_PUBLISHER_METADATA_OUTPUT_DIR);
        jobState.setProp(ConfigurationKeys.DATA_PUBLISH_WRITER_METADATA_KEY, "true");
        jobState.setProp(ConfigurationKeys.WRITER_METADATA_KEY, globalMetadataJson);
        jobState.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, finalDir);
        jobState.setProp(ConfigurationKeys.DATA_PUBLISHER_APPEND_EXTRACT_TO_FINAL_DIR, "false");
        jobState.setProp(ConfigurationKeys.DATA_PUBLISHER_METADATA_OUTPUT_FILE, "metadata.json");

        // Work unit #1: foo1.json
        WorkUnitState firstUnit = new WorkUnitState();
        FsWriterMetrics firstMetrics = buildWriterMetrics("foo1.json", null, 0, 10);
        firstUnit.setProp(FsDataWriter.FS_WRITER_METRICS_KEY, firstMetrics.toJson());
        firstUnit.setProp(ConfigurationKeys.WRITER_METADATA_KEY, globalMetadataJson);
        addStateToWorkunit(jobState, firstUnit);

        // Work unit #2: foo3.json
        WorkUnitState secondUnit = new WorkUnitState();
        FsWriterMetrics secondMetrics = buildWriterMetrics("foo3.json", null, 1, 30);
        secondUnit.setProp(ConfigurationKeys.WRITER_METADATA_KEY, globalMetadataJson);
        secondUnit.setProp(FsDataWriter.FS_WRITER_METRICS_KEY, secondMetrics.toJson());
        addStateToWorkunit(jobState, secondUnit);

        // Work unit #3: foo4.json
        WorkUnitState thirdUnit = new WorkUnitState();
        FsWriterMetrics thirdMetrics = buildWriterMetrics("foo4.json", null, 2, 55);
        thirdUnit.setProp(ConfigurationKeys.WRITER_METADATA_KEY, globalMetadataJson);
        thirdUnit.setProp(FsDataWriter.FS_WRITER_METRICS_KEY, thirdMetrics.toJson());
        addStateToWorkunit(jobState, thirdUnit);

        BaseDataPublisher publisher = new BaseDataPublisher(jobState);
        publisher.publishMetadata(ImmutableList.of(firstUnit, secondUnit, thirdUnit));

        File publishedMetadata = new File(finalDir, "metadata.json");
        checkMetadata(
            publishedMetadata,
            3,
            95,
            new FsWriterMetrics.FileInfo("foo3.json", 30),
            new FsWriterMetrics.FileInfo("foo1.json", 10),
            new FsWriterMetrics.FileInfo("foo4.json", 55));
    } finally {
        FileUtils.deleteDirectory(publishPath);
    }
}
Also used : GlobalMetadata(org.apache.gobblin.metadata.types.GlobalMetadata) TaskState(org.apache.hadoop.mapreduce.v2.api.records.TaskState) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) FsWriterMetrics(org.apache.gobblin.writer.FsWriterMetrics) File(java.io.File) Test(org.testng.annotations.Test)

Example 2 with FsWriterMetrics

use of org.apache.gobblin.writer.FsWriterMetrics in project incubator-gobblin by apache.

the class BaseDataPublisherTest method testWithFsMetricsAndPartitions.

/**
 * Publishes writer metrics for work units spread over two partition directories
 * ({@code 1-2-3-4} and {@code 5-6-7-8}) and verifies the {@code metadata.json} found in each
 * partition directory.
 *
 * @throws IOException if the temporary publish directory cannot be cleaned up
 */
@Test
public void testWithFsMetricsAndPartitions() throws IOException {
    File publishPath = Files.createTempDir();
    try {
        // Pre-create both partition directories under the publish root.
        File firstPartitionDir = new File(publishPath, "1-2-3-4");
        firstPartitionDir.mkdir();
        File secondPartitionDir = new File(publishPath, "5-6-7-8");
        secondPartitionDir.mkdir();

        State jobState = buildDefaultState(1);
        String globalMetadataJson = new GlobalMetadata().toJson();
        jobState.removeProp(ConfigurationKeys.DATA_PUBLISHER_METADATA_OUTPUT_DIR);
        jobState.setProp(ConfigurationKeys.DATA_PUBLISH_WRITER_METADATA_KEY, "true");
        jobState.setProp(ConfigurationKeys.WRITER_METADATA_KEY, globalMetadataJson);
        jobState.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, publishPath.getAbsolutePath());
        jobState.setProp(ConfigurationKeys.DATA_PUBLISHER_APPEND_EXTRACT_TO_FINAL_DIR, "false");
        jobState.setProp(ConfigurationKeys.DATA_PUBLISHER_METADATA_OUTPUT_FILE, "metadata.json");

        // Work unit #1 carries metrics for both partitions.
        WorkUnitState firstUnit = new WorkUnitState();
        FsWriterMetrics firstPartitionMetrics = buildWriterMetrics("foo1.json", "1-2-3-4", 0, 10);
        FsWriterMetrics secondPartitionMetrics = buildWriterMetrics("foo1.json", "5-6-7-8", 10, 20);
        firstUnit.setProp(ConfigurationKeys.WRITER_PARTITION_PATH_KEY, "1-2-3-4");
        firstUnit.setProp(FsDataWriter.FS_WRITER_METRICS_KEY, firstPartitionMetrics.toJson());
        // NOTE(review): the metrics key suffixes below contain a leading space (" _0", " _1")
        // while the partition-path suffixes do not ("_0", "_1") — confirm this is intentional.
        firstUnit.setProp(ConfigurationKeys.WRITER_PARTITION_PATH_KEY + "_0", "1-2-3-4");
        firstUnit.setProp(FsDataWriter.FS_WRITER_METRICS_KEY + " _0", secondPartitionMetrics.toJson());
        firstUnit.setProp(ConfigurationKeys.WRITER_PARTITION_PATH_KEY + "_1", "5-6-7-8");
        firstUnit.setProp(FsDataWriter.FS_WRITER_METRICS_KEY + " _1", secondPartitionMetrics.toJson());
        firstUnit.setProp(ConfigurationKeys.WRITER_METADATA_KEY, globalMetadataJson);
        addStateToWorkunit(jobState, firstUnit);

        // Work unit #2 writes foo3.json into partition 1-2-3-4.
        WorkUnitState secondUnit = new WorkUnitState();
        FsWriterMetrics foo3Metrics = buildWriterMetrics("foo3.json", "1-2-3-4", 1, 30);
        secondUnit.setProp(ConfigurationKeys.WRITER_PARTITION_PATH_KEY, "1-2-3-4");
        secondUnit.setProp(ConfigurationKeys.WRITER_METADATA_KEY, globalMetadataJson);
        secondUnit.setProp(FsDataWriter.FS_WRITER_METRICS_KEY, foo3Metrics.toJson());
        addStateToWorkunit(jobState, secondUnit);

        // Work unit #3 writes foo4.json into partition 5-6-7-8.
        WorkUnitState thirdUnit = new WorkUnitState();
        FsWriterMetrics foo4Metrics = buildWriterMetrics("foo4.json", "5-6-7-8", 2, 55);
        thirdUnit.setProp(ConfigurationKeys.WRITER_PARTITION_PATH_KEY, "5-6-7-8");
        thirdUnit.setProp(ConfigurationKeys.WRITER_METADATA_KEY, globalMetadataJson);
        thirdUnit.setProp(FsDataWriter.FS_WRITER_METRICS_KEY, foo4Metrics.toJson());
        addStateToWorkunit(jobState, thirdUnit);

        BaseDataPublisher publisher = new BaseDataPublisher(jobState);
        publisher.publishMetadata(ImmutableList.of(firstUnit, secondUnit, thirdUnit));

        // Each partition directory is checked against its own expected files.
        checkMetadata(
            new File(firstPartitionDir, "metadata.json"),
            2,
            40,
            new FsWriterMetrics.FileInfo("foo3.json", 30),
            new FsWriterMetrics.FileInfo("foo1.json", 10));
        checkMetadata(
            new File(secondPartitionDir, "metadata.json"),
            2,
            75,
            new FsWriterMetrics.FileInfo("foo1.json", 20),
            new FsWriterMetrics.FileInfo("foo4.json", 55));
    } finally {
        FileUtils.deleteDirectory(publishPath);
    }
}
Also used : GlobalMetadata(org.apache.gobblin.metadata.types.GlobalMetadata) TaskState(org.apache.hadoop.mapreduce.v2.api.records.TaskState) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) FsWriterMetrics(org.apache.gobblin.writer.FsWriterMetrics) File(java.io.File) Test(org.testng.annotations.Test)

Example 3 with FsWriterMetrics

use of org.apache.gobblin.writer.FsWriterMetrics in project incubator-gobblin by apache.

the class BaseDataPublisherTest method testMergesExistingMetadata.

/**
 * Seeds the publish directory with an existing {@code metadata.json} (copied from the
 * {@code publisher/sample_metadata.json} classpath resource), publishes metrics for one new
 * file, and verifies via {@code checkMetadata} that the resulting file lists both the
 * pre-existing entries and {@code newfile.json}.
 *
 * @throws IOException if the sample metadata cannot be copied or the temporary directory
 *     cannot be cleaned up
 */
@Test
public void testMergesExistingMetadata() throws IOException {
    File publishPath = Files.createTempDir();
    try {
        // Copy the metadata file from resources into the publish path. Both streams are managed
        // by try-with-resources so the classpath stream is closed as well as the output file.
        try (InputStream mdStream = this.getClass().getClassLoader().getResourceAsStream("publisher/sample_metadata.json");
            FileOutputStream fOs = new FileOutputStream(new File(publishPath, "metadata.json"))) {
            // getResourceAsStream returns null for a missing resource; fail with a clear message
            // instead of an NPE from inside the copy.
            Assert.assertNotNull(mdStream, "Test resource publisher/sample_metadata.json not found on classpath");
            IOUtils.copy(mdStream, fOs);
        }
        State s = buildDefaultState(1);
        String md = new GlobalMetadata().toJson();
        s.removeProp(ConfigurationKeys.DATA_PUBLISHER_METADATA_OUTPUT_DIR);
        s.setProp(ConfigurationKeys.DATA_PUBLISH_WRITER_METADATA_KEY, "true");
        s.setProp(ConfigurationKeys.WRITER_METADATA_KEY, md);
        s.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, publishPath.getAbsolutePath());
        s.setProp(ConfigurationKeys.DATA_PUBLISHER_APPEND_EXTRACT_TO_FINAL_DIR, "false");
        s.setProp(ConfigurationKeys.DATA_PUBLISHER_METADATA_OUTPUT_FILE, "metadata.json");
        // Single new work unit contributing newfile.json.
        WorkUnitState wuState1 = new WorkUnitState();
        FsWriterMetrics metrics1 = buildWriterMetrics("newfile.json", null, 0, 90);
        wuState1.setProp(FsDataWriter.FS_WRITER_METRICS_KEY, metrics1.toJson());
        wuState1.setProp(ConfigurationKeys.WRITER_METADATA_KEY, md);
        addStateToWorkunit(s, wuState1);
        BaseDataPublisher publisher = new BaseDataPublisher(s);
        publisher.publishMetadata(ImmutableList.of(wuState1));
        // Expect the three entries from the seeded file plus the newly published one.
        checkMetadata(new File(publishPath.getAbsolutePath(), "metadata.json"), 4, 185, new FsWriterMetrics.FileInfo("foo3.json", 30), new FsWriterMetrics.FileInfo("foo1.json", 10), new FsWriterMetrics.FileInfo("foo4.json", 55), new FsWriterMetrics.FileInfo("newfile.json", 90));
    } finally {
        FileUtils.deleteDirectory(publishPath);
    }
}
Also used : GlobalMetadata(org.apache.gobblin.metadata.types.GlobalMetadata) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) TaskState(org.apache.hadoop.mapreduce.v2.api.records.TaskState) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) FileOutputStream(java.io.FileOutputStream) FsWriterMetrics(org.apache.gobblin.writer.FsWriterMetrics) File(java.io.File) Test(org.testng.annotations.Test)

Example 4 with FsWriterMetrics

use of org.apache.gobblin.writer.FsWriterMetrics in project incubator-gobblin by apache.

the class BaseDataPublisherTest method testWithFsMetricsBranchesAndPartitions.

/**
 * Publishes writer metrics for a two-branch job in which each branch has its own final
 * directory containing the partition directories {@code 1-2-3-4} and {@code 5-6-7-8}, then
 * verifies the per-branch, per-partition metadata files ({@code metadata.json.0} and
 * {@code metadata.json.1}).
 *
 * @throws IOException declared for parity with the other publisher tests
 */
@Test
public void testWithFsMetricsBranchesAndPartitions() throws IOException {
    File[] publishPaths = new File[] {
        Files.createTempDir(), // branch 0
        Files.createTempDir()  // branch 1
    };
    try {
        // For every branch directory, derive and create the two partition directories.
        List<File[]> branchPaths = Arrays.stream(publishPaths).map(branchPath -> new File[] { new File(branchPath, "1-2-3-4"), new File(branchPath, "5-6-7-8") }).collect(Collectors.toList());
        branchPaths.forEach(partitionPaths -> Arrays.stream(partitionPaths).forEach(File::mkdir));
        State s = buildDefaultState(2);
        String md = new GlobalMetadata().toJson();
        s.removeProp(ConfigurationKeys.DATA_PUBLISHER_METADATA_OUTPUT_DIR);
        // Branch-specific settings use a ".<branchId>" suffix on the property key.
        s.setProp(ConfigurationKeys.DATA_PUBLISH_WRITER_METADATA_KEY + ".0", "true");
        s.setProp(ConfigurationKeys.DATA_PUBLISH_WRITER_METADATA_KEY + ".1", "true");
        s.setProp(ConfigurationKeys.WRITER_METADATA_KEY + ".0", md);
        s.setProp(ConfigurationKeys.WRITER_METADATA_KEY + ".1", md);
        s.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR + ".0", publishPaths[0].getAbsolutePath());
        s.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR + ".1", publishPaths[1].getAbsolutePath());
        s.setProp(ConfigurationKeys.DATA_PUBLISHER_APPEND_EXTRACT_TO_FINAL_DIR, "false");
        s.setProp(ConfigurationKeys.DATA_PUBLISHER_APPEND_EXTRACT_TO_FINAL_DIR + ".0", "false");
        s.setProp(ConfigurationKeys.DATA_PUBLISHER_APPEND_EXTRACT_TO_FINAL_DIR + ".1", "false");
        s.setProp(ConfigurationKeys.DATA_PUBLISHER_METADATA_OUTPUT_FILE, "metadata.json");
        // Work unit #1: branch 0, metrics for both partitions.
        WorkUnitState wuState1 = new WorkUnitState();
        FsWriterMetrics metrics1 = buildWriterMetrics("foo1.json", "1-2-3-4", 0, 10);
        FsWriterMetrics metrics2 = buildWriterMetrics("foo1.json", "5-6-7-8", 10, 20);
        wuState1.setProp(ConfigurationKeys.WRITER_PARTITION_PATH_KEY + ".0", "1-2-3-4");
        wuState1.setProp(FsDataWriter.FS_WRITER_METRICS_KEY + ".0", metrics1.toJson());
        wuState1.setProp(ConfigurationKeys.WRITER_PARTITION_PATH_KEY + ".0_0", "1-2-3-4");
        wuState1.setProp(FsDataWriter.FS_WRITER_METRICS_KEY + ".0_0", metrics2.toJson());
        wuState1.setProp(ConfigurationKeys.WRITER_PARTITION_PATH_KEY + ".0" + "_1", "5-6-7-8");
        wuState1.setProp(FsDataWriter.FS_WRITER_METRICS_KEY + ".0_1", metrics2.toJson());
        wuState1.setProp(ConfigurationKeys.WRITER_METADATA_KEY + ".0", md);
        addStateToWorkunit(s, wuState1);
        // Work unit #2: branch 1, partition 1-2-3-4. This uses the five-argument
        // buildWriterMetrics overload (presumably taking an explicit branch id — confirm).
        WorkUnitState wuState2 = new WorkUnitState();
        FsWriterMetrics metrics3 = buildWriterMetrics("foo3.json", "1-2-3-4", 1, 1, 30);
        wuState2.setProp(ConfigurationKeys.WRITER_PARTITION_PATH_KEY + ".1", "1-2-3-4");
        wuState2.setProp(ConfigurationKeys.WRITER_METADATA_KEY + ".1", md);
        wuState2.setProp(FsDataWriter.FS_WRITER_METRICS_KEY + ".1", metrics3.toJson());
        addStateToWorkunit(s, wuState2);
        // Work unit #3: branch 0, partition 5-6-7-8.
        WorkUnitState wuState3 = new WorkUnitState();
        FsWriterMetrics metrics4 = buildWriterMetrics("foo4.json", "5-6-7-8", 2, 55);
        wuState3.setProp(ConfigurationKeys.WRITER_PARTITION_PATH_KEY + ".0", "5-6-7-8");
        wuState3.setProp(ConfigurationKeys.WRITER_METADATA_KEY + ".0", md);
        wuState3.setProp(FsDataWriter.FS_WRITER_METRICS_KEY + ".0", metrics4.toJson());
        addStateToWorkunit(s, wuState3);
        BaseDataPublisher publisher = new BaseDataPublisher(s);
        publisher.publishMetadata(ImmutableList.of(wuState1, wuState2, wuState3));
        // Metadata files carry the branch id as a suffix.
        checkMetadata(new File(branchPaths.get(0)[0], "metadata.json.0"), 1, 10, new FsWriterMetrics.FileInfo("foo1.json", 10));
        checkMetadata(new File(branchPaths.get(0)[1], "metadata.json.0"), 2, 75, new FsWriterMetrics.FileInfo("foo1.json", 20), new FsWriterMetrics.FileInfo("foo4.json", 55));
        checkMetadata(new File(branchPaths.get(1)[0], "metadata.json.1"), 1, 30, new FsWriterMetrics.FileInfo("foo3.json", 30));
    } finally {
        Arrays.stream(publishPaths).forEach(dir -> {
            try {
                FileUtils.deleteDirectory(dir);
            } catch (IOException e) {
                // Lambdas cannot throw checked exceptions; wrap, but keep the cause so the
                // failing path and reason are visible in the stack trace.
                throw new RuntimeException("IOError", e);
            }
        });
    }
}
Also used : ForkOperatorUtils(org.apache.gobblin.util.ForkOperatorUtils) Arrays(java.util.Arrays) TaskState(org.apache.hadoop.mapreduce.v2.api.records.TaskState) TaskScopeInstance(org.apache.gobblin.broker.gobblin_scopes.TaskScopeInstance) Test(org.testng.annotations.Test) ArrayList(java.util.ArrayList) Assert(org.testng.Assert) ImmutableList(com.google.common.collect.ImmutableList) Files(com.google.common.io.Files) MetadataMerger(org.apache.gobblin.metadata.MetadataMerger) Map(java.util.Map) GlobalMetadata(org.apache.gobblin.metadata.types.GlobalMetadata) Path(org.apache.hadoop.fs.Path) ConfigFactory(com.typesafe.config.ConfigFactory) GobblinScopeTypes(org.apache.gobblin.broker.gobblin_scopes.GobblinScopeTypes) PartitionIdentifier(org.apache.gobblin.writer.PartitionIdentifier) Properties(java.util.Properties) FsWriterMetrics(org.apache.gobblin.writer.FsWriterMetrics) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) FileOutputStream(java.io.FileOutputStream) Set(java.util.Set) DatasetDescriptor(org.apache.gobblin.dataset.DatasetDescriptor) IOException(java.io.IOException) FileUtils(org.apache.commons.io.FileUtils) FileInputStream(java.io.FileInputStream) ConfigurationKeys(org.apache.gobblin.configuration.ConfigurationKeys) Collectors(java.util.stream.Collectors) FsDataWriter(org.apache.gobblin.writer.FsDataWriter) File(java.io.File) StandardCharsets(java.nio.charset.StandardCharsets) SharedResourcesBrokerFactory(org.apache.gobblin.broker.SharedResourcesBrokerFactory) LineageInfo(org.apache.gobblin.metrics.event.lineage.LineageInfo) IOUtils(org.apache.commons.io.IOUtils) JobScopeInstance(org.apache.gobblin.broker.gobblin_scopes.JobScopeInstance) List(java.util.List) SubscopedBrokerBuilder(org.apache.gobblin.broker.iface.SubscopedBrokerBuilder) SharedResourcesBroker(org.apache.gobblin.broker.iface.SharedResourcesBroker) Collections(java.util.Collections) InputStream(java.io.InputStream) 
WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) GlobalMetadata(org.apache.gobblin.metadata.types.GlobalMetadata) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) IOException(java.io.IOException) FsWriterMetrics(org.apache.gobblin.writer.FsWriterMetrics) TaskState(org.apache.hadoop.mapreduce.v2.api.records.TaskState) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) File(java.io.File) Test(org.testng.annotations.Test)

Example 5 with FsWriterMetrics

use of org.apache.gobblin.writer.FsWriterMetrics in project incubator-gobblin by apache.

the class BaseDataPublisherTest method testNoOutputWhenDisabledWithPartitions.

/**
 * Verifies that nothing is written to the publish directory when neither a metadata output
 * directory nor a metadata output file is configured, even though the work unit carries
 * partitioned writer metrics.
 *
 * @throws IOException if the temporary publish directory cannot be cleaned up
 */
@Test
public void testNoOutputWhenDisabledWithPartitions() throws IOException {
    File publishPath = Files.createTempDir();
    // Clean up the temp directory even if the test fails, like the other publisher tests do.
    try {
        State s = buildDefaultState(1);
        // Remove both metadata output settings so metadata publishing has no destination.
        s.removeProp(ConfigurationKeys.DATA_PUBLISHER_METADATA_OUTPUT_DIR);
        s.removeProp(ConfigurationKeys.DATA_PUBLISHER_METADATA_OUTPUT_FILE);
        s.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, publishPath.getAbsolutePath());
        WorkUnitState wuState = new WorkUnitState();
        addStateToWorkunit(s, wuState);
        wuState.setProp(ConfigurationKeys.WRITER_METADATA_KEY, "abcdefg");
        FsWriterMetrics metrics1 = buildWriterMetrics("foo1.json", "1-2-3-4", 0, 10);
        FsWriterMetrics metrics2 = buildWriterMetrics("foo1.json", "5-6-7-8", 10, 20);
        wuState.setProp(ConfigurationKeys.WRITER_PARTITION_PATH_KEY, "1-2-3-4");
        wuState.setProp(FsDataWriter.FS_WRITER_METRICS_KEY, metrics1.toJson());
        wuState.setProp(ConfigurationKeys.WRITER_PARTITION_PATH_KEY + "_0", "1-2-3-4");
        wuState.setProp(FsDataWriter.FS_WRITER_METRICS_KEY + " _0", metrics2.toJson());
        wuState.setProp(ConfigurationKeys.WRITER_PARTITION_PATH_KEY + "_1", "5-6-7-8");
        wuState.setProp(FsDataWriter.FS_WRITER_METRICS_KEY + " _1", metrics2.toJson());
        BaseDataPublisher publisher = new BaseDataPublisher(s);
        publisher.publishMetadata(Collections.singletonList(wuState));
        String[] filesInPublishDir = publishPath.list();
        // File.list() returns null on I/O error or if the path is not a directory.
        Assert.assertNotNull(filesInPublishDir, "Could not list publish path " + publishPath);
        // TestNG argument order is (actual, expected, message).
        Assert.assertEquals(filesInPublishDir.length, 0, "Expected 0 files to be output to publish path");
    } finally {
        FileUtils.deleteDirectory(publishPath);
    }
}
Also used : TaskState(org.apache.hadoop.mapreduce.v2.api.records.TaskState) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) FsWriterMetrics(org.apache.gobblin.writer.FsWriterMetrics) File(java.io.File) Test(org.testng.annotations.Test)

Aggregations

State (org.apache.gobblin.configuration.State)6 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)6 FsWriterMetrics (org.apache.gobblin.writer.FsWriterMetrics)6 File (java.io.File)5 GlobalMetadata (org.apache.gobblin.metadata.types.GlobalMetadata)4 TaskState (org.apache.hadoop.mapreduce.v2.api.records.TaskState)4 Test (org.testng.annotations.Test)4 ImmutableList (com.google.common.collect.ImmutableList)2 ConfigFactory (com.typesafe.config.ConfigFactory)2 FileInputStream (java.io.FileInputStream)2 FileOutputStream (java.io.FileOutputStream)2 IOException (java.io.IOException)2 InputStream (java.io.InputStream)2 StandardCharsets (java.nio.charset.StandardCharsets)2 Collections (java.util.Collections)2 List (java.util.List)2 Map (java.util.Map)2 Set (java.util.Set)2 IOUtils (org.apache.commons.io.IOUtils)2 ConfigurationKeys (org.apache.gobblin.configuration.ConfigurationKeys)2