Search in sources :

Example 46 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class HadoopFsHelperTest method testGetFileStreamSucceedsWithUncompressedFile.

/**
 * Checks that {@code HadoopFsHelper.getFileStream} returns the unmodified contents of the
 * uncompressed classpath resource {@code /source/simple.tsv} when the helper is configured
 * with the {@code /source/} resource directory as its file system URI.
 *
 * @throws FileBasedHelperException declared by the helper calls made in this test
 * @throws IOException if reading or closing the returned stream fails
 */
@Test
public void testGetFileStreamSucceedsWithUncompressedFile() throws FileBasedHelperException, IOException {
    SourceState sourceState = new SourceState();
    URL rootUrl = getClass().getResource("/source/");
    String rootPath = rootUrl.toString();
    sourceState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, rootPath);
    HadoopFsHelper fsHelper = new HadoopFsHelper(sourceState);
    fsHelper.connect();
    URL url = getClass().getResource("/source/simple.tsv");
    String path = url.toString();
    // try-with-resources: the stream is closed even if reading or the assertion fails,
    // so the test no longer leaks an open file handle.
    try (InputStream in = fsHelper.getFileStream(path)) {
        String contents = IOUtils.toString(in, "UTF-8");
        Assert.assertEquals(contents, "A\t1\nB\t2\n");
    }
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) InputStream(java.io.InputStream) URL(java.net.URL) Test(org.testng.annotations.Test)

Example 47 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class HadoopFsHelperTest method testConnectFailsWithS3URLWithoutAWSCredentials.

/**
 * Connecting to an S3 URI with a freshly created Hadoop {@code Configuration}
 * (no AWS credentials set on it) is expected to fail with an
 * {@link IllegalArgumentException}.
 */
@Test(expectedExceptions = IllegalArgumentException.class)
public void testConnectFailsWithS3URLWithoutAWSCredentials() throws FileBasedHelperException {
    // Untouched configuration: nothing S3-related is set here.
    Configuration hadoopConf = new Configuration();
    SourceState state = new SourceState();
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, "s3://support.elasticmapreduce/spark/install-spark/");
    // The expected exception must surface from construction or connect().
    new HadoopFsHelper(state, hadoopConf).connect();
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) Configuration(org.apache.hadoop.conf.Configuration) Test(org.testng.annotations.Test)

Example 48 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class OldApiHadoopFileInputSourceTest method setUp.

/**
 * Prepares the fixture shared by the tests in this class: writes {@code TEXT} into
 * {@code test.txt} under {@code getFileDir()} and builds a {@code SourceState} that
 * points at that file.
 *
 * @throws IOException if the directory or the text file cannot be created, or if
 *         writing the file fails
 */
@BeforeClass
public void setUp() throws IOException {
    File inputFile = new File(getFileDir(), "test.txt");
    File parentDir = inputFile.getParentFile();

    // Only attempt to create the directory when it is not already there.
    boolean dirAvailable = parentDir.exists() || parentDir.mkdir();
    if (!dirAvailable) {
        throw new IOException("Failed to create directory: " + parentDir);
    }

    // createNewFile() reports false when the file already exists.
    boolean fileCreated = inputFile.createNewFile();
    if (!fileCreated) {
        throw new IOException("Failed to create text file: " + inputFile);
    }
    Files.write(TEXT, inputFile, ConfigurationKeys.DEFAULT_CHARSET_ENCODING);

    SourceState state = new SourceState();
    this.sourceState = state;
    state.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, Extract.TableType.SNAPSHOT_ONLY.toString());
    state.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, "test");
    state.setProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, "test");
    state.setProp(HadoopFileInputSource.FILE_INPUT_PATHS_KEY, inputFile.getAbsolutePath());
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) IOException(java.io.IOException) File(java.io.File) BeforeClass(org.testng.annotations.BeforeClass)

Example 49 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class PartitionerTest method testGetPartitionList.

/**
 * Checks {@code getPartitionList(-1)} against the expected partitions in three
 * successive configurations of the same {@code SourceState}: before any delta field
 * is set, after the delta field and partitioning properties are set, and after
 * explicit start/end values are added.
 */
@Test
public void testGetPartitionList() {
    List<Partition> expected = new ArrayList<>();
    SourceState state = new SourceState();
    state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_IS_WATERMARK_OVERRIDE, true);
    TestPartitioner testPartitioner = new TestPartitioner(state);
    long defaultWatermark = ConfigurationKeys.DEFAULT_WATERMARK_VALUE;

    // Scenario 1: the watermark doesn't exist.
    expected.add(new Partition(defaultWatermark, defaultWatermark, true, false));
    Assert.assertEquals(testPartitioner.getPartitionList(-1), expected);

    // Scenario 2: watermark field and the remaining properties are set,
    // but the user specified no watermark values.
    state.setProp(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY, "time");
    state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, "hour");
    state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE, "SNAPSHOT");
    state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_PARTITION_INTERVAL, "2");
    state.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, "2");
    state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_IS_WATERMARK_OVERRIDE, true);
    expected.clear();
    long currentTime = Long.parseLong(TestPartitioner.currentTimeString);
    expected.add(new Partition(defaultWatermark, currentTime, true, false));
    Assert.assertEquals(testPartitioner.getPartitionList(-1), expected);

    // Scenario 3: user-specified low and high watermarks.
    state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_START_VALUE, "20170101002010");
    state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_END_VALUE, "20170101122010");
    expected.clear();
    expected.add(new Partition(20170101000000L, 20170101060000L));
    expected.add(new Partition(20170101060000L, 20170101120000L, true, true));
    List<Partition> actual = testPartitioner.getPartitionList(-1);
    // Sort before comparing so the assertion does not depend on the returned order.
    Collections.sort(actual, Partitioner.ascendingComparator);
    Assert.assertEquals(actual, expected);
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) ArrayList(java.util.ArrayList) Test(org.testng.annotations.Test)

Example 50 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class PartitionerTest method testGetLowWatermarkOnAppendExtract.

/**
 * Tests {@code getLowWatermark} for the append extract type
 * ({@code ExtractType.APPEND_DAILY}), with and without a previous watermark, and
 * with {@code SOURCE_QUERYBASED_LOW_WATERMARK_BACKUP_SECS} set.
 */
@Test
public void testGetLowWatermarkOnAppendExtract() {
    String configuredStart = "20140101000000";
    SourceState state = new SourceState();
    state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_START_VALUE, configuredStart);
    TestPartitioner testPartitioner = new TestPartitioner(state);
    ExtractType appendType = ExtractType.APPEND_DAILY;
    int deltaForNextWatermark = 1;

    // Without a previous watermark the configured start value is expected.
    Assert.assertEquals(
        testPartitioner.getLowWatermark(appendType, null, ConfigurationKeys.DEFAULT_WATERMARK_VALUE, deltaForNextWatermark),
        Long.parseLong(configuredStart),
        "Low watermark should be " + configuredStart);

    // With a previous watermark the expectation is previous + delta for both watermark types.
    long lastWatermark = 20140101000050L;
    long expectedLow = lastWatermark + deltaForNextWatermark;
    String failureMessage = "Low watermark should be " + expectedLow;
    Assert.assertEquals(
        testPartitioner.getLowWatermark(appendType, WatermarkType.SIMPLE, lastWatermark, deltaForNextWatermark),
        expectedLow, failureMessage);
    Assert.assertEquals(
        testPartitioner.getLowWatermark(appendType, WatermarkType.TIMESTAMP, lastWatermark, deltaForNextWatermark),
        expectedLow, failureMessage);

    // Setting SOURCE_QUERYBASED_LOW_WATERMARK_BACKUP_SECS must not change the result.
    int backupSeconds = 10;
    state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_LOW_WATERMARK_BACKUP_SECS, backupSeconds);
    Assert.assertEquals(
        testPartitioner.getLowWatermark(appendType, WatermarkType.SIMPLE, lastWatermark, deltaForNextWatermark),
        expectedLow, failureMessage);
    Assert.assertEquals(
        testPartitioner.getLowWatermark(appendType, WatermarkType.TIMESTAMP, lastWatermark, deltaForNextWatermark),
        expectedLow, failureMessage);
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) ExtractType(org.apache.gobblin.source.extractor.extract.ExtractType) Test(org.testng.annotations.Test)

Aggregations

SourceState (org.apache.gobblin.configuration.SourceState): 90, Test (org.testng.annotations.Test): 76, WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 44, WorkUnit (org.apache.gobblin.source.workunit.WorkUnit): 38, State (org.apache.gobblin.configuration.State): 30, WorkingState (org.apache.gobblin.configuration.WorkUnitState.WorkingState): 11, Partition (org.apache.hadoop.hive.ql.metadata.Partition): 8, Table (org.apache.hadoop.hive.ql.metadata.Table): 8, IterableDatasetFinder (org.apache.gobblin.dataset.IterableDatasetFinder): 7, LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark): 7, Extract (org.apache.gobblin.source.workunit.Extract): 7, DateTime (org.joda.time.DateTime): 7, Dataset (org.apache.gobblin.dataset.Dataset): 6, PartitionableDataset (org.apache.gobblin.dataset.PartitionableDataset): 6, MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit): 6, WorkUnitStream (org.apache.gobblin.source.workunit.WorkUnitStream): 6, IOException (java.io.IOException): 5, Path (org.apache.hadoop.fs.Path): 5, Gson (com.google.gson.Gson): 4, JsonObject (com.google.gson.JsonObject): 4