Example 1 with State

Use of org.apache.gobblin.configuration.State in project incubator-gobblin by apache.

The class AbstractSourceTest, method testGetPreviousWorkUnitStatesOnFullRetryPartialCommit.

/**
 * Test the case where the work unit retry policy is "onfull" but the job commit policy
 * is "partial": no previous work unit states should be returned for retry.
 */
@Test
public void testGetPreviousWorkUnitStatesOnFullRetryPartialCommit() {
    SourceState sourceState = new SourceState(new State(), this.previousWorkUnitStates);
    sourceState.setProp(ConfigurationKeys.WORK_UNIT_RETRY_POLICY_KEY, "onfull");
    sourceState.setProp(ConfigurationKeys.JOB_COMMIT_POLICY_KEY, "partial");
    Assert.assertEquals(this.testSource.getPreviousWorkUnitStatesForRetry(sourceState), Collections.EMPTY_LIST);
}
Also used: SourceState (org.apache.gobblin.configuration.SourceState), WorkUnitState (org.apache.gobblin.configuration.WorkUnitState), State (org.apache.gobblin.configuration.State), WorkingState (org.apache.gobblin.configuration.WorkUnitState.WorkingState), Test (org.testng.annotations.Test)
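
For contrast, here is a minimal hedged sketch of the pairing that is expected to trigger retries. It is not copied from the test class; it assumes, based on the policy name and the partial-commit case above, that "onfull" retry applies only when the commit policy is "full":

// Hedged sketch (assumption): with the "full" commit policy, the "onfull"
// retry policy should return the uncommitted previous work unit states
// rather than an empty list.
SourceState sourceState = new SourceState(new State(), this.previousWorkUnitStates);
sourceState.setProp(ConfigurationKeys.WORK_UNIT_RETRY_POLICY_KEY, "onfull");
sourceState.setProp(ConfigurationKeys.JOB_COMMIT_POLICY_KEY, "full");
Assert.assertEquals(this.testSource.getPreviousWorkUnitStatesForRetry(sourceState), this.expectedPreviousWorkUnitStates);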

Example 2 with State

Use of org.apache.gobblin.configuration.State in project incubator-gobblin by apache.

The class AbstractSourceTest, method testGetPreviousWorkUnitStatesEnabledRetry.

/**
 * Test the always-retry behavior, enabled via WORK_UNIT_RETRY_ENABLED_KEY:
 * all previous work unit states should be returned for retry.
 */
@Test
public void testGetPreviousWorkUnitStatesEnabledRetry() {
    SourceState sourceState = new SourceState(new State(), this.previousWorkUnitStates);
    sourceState.setProp(ConfigurationKeys.WORK_UNIT_RETRY_ENABLED_KEY, Boolean.TRUE);
    List<WorkUnitState> returnedWorkUnitStates = this.testSource.getPreviousWorkUnitStatesForRetry(sourceState);
    Assert.assertEquals(returnedWorkUnitStates, this.expectedPreviousWorkUnitStates);
}
Also used: SourceState (org.apache.gobblin.configuration.SourceState), WorkUnitState (org.apache.gobblin.configuration.WorkUnitState), State (org.apache.gobblin.configuration.State), WorkingState (org.apache.gobblin.configuration.WorkUnitState.WorkingState), Test (org.testng.annotations.Test)
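
The flag is a plain boolean toggle. A hedged sketch of the opposite setting, assuming symmetric behavior (not copied from the test class; verify against AbstractSourceTest):

// Hedged sketch (assumption): with retry explicitly disabled,
// getPreviousWorkUnitStatesForRetry should return no states.
SourceState sourceState = new SourceState(new State(), this.previousWorkUnitStates);
sourceState.setProp(ConfigurationKeys.WORK_UNIT_RETRY_ENABLED_KEY, Boolean.FALSE);
Assert.assertEquals(this.testSource.getPreviousWorkUnitStatesForRetry(sourceState), Collections.EMPTY_LIST);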

Example 3 with State

Use of org.apache.gobblin.configuration.State in project incubator-gobblin by apache.

The class EncryptionConfigParserTest, method testWithWriterPrefix.

private void testWithWriterPrefix(int numBranches, int branch) {
    String branchString = "";
    if (numBranches > 1) {
        branchString = String.format(".%d", branch);
    }
    Properties properties = new Properties();
    properties.put(EncryptionConfigParser.WRITER_ENCRYPT_PREFIX + "." + EncryptionConfigParser.ENCRYPTION_ALGORITHM_KEY + branchString, "any");
    properties.put(EncryptionConfigParser.WRITER_ENCRYPT_PREFIX + "." + EncryptionConfigParser.ENCRYPTION_KEYSTORE_PATH_KEY + branchString, "/tmp/foobar");
    properties.put(EncryptionConfigParser.WRITER_ENCRYPT_PREFIX + "." + EncryptionConfigParser.ENCRYPTION_KEYSTORE_PASSWORD_KEY + branchString, "abracadabra");
    State s = new State(properties);
    Map<String, Object> parsedProperties = EncryptionConfigParser.getConfigForBranch(EncryptionConfigParser.EntityType.WRITER, s, numBranches, branch);
    Assert.assertNotNull(parsedProperties, "Expected parser to only return one record");
    Assert.assertEquals(EncryptionConfigParser.getEncryptionType(parsedProperties), "any");
    Assert.assertEquals(EncryptionConfigParser.getKeystorePath(parsedProperties), "/tmp/foobar");
    Assert.assertEquals(EncryptionConfigParser.getKeystorePassword(parsedProperties), "abracadabra");
}
Also used: WorkUnitState (org.apache.gobblin.configuration.WorkUnitState), State (org.apache.gobblin.configuration.State), Properties (java.util.Properties)
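
The test exercises the branch-suffix convention for property keys. Below is a hedged sketch of how a fully qualified key composes; the literal constant values are assumptions (WRITER_ENCRYPT_PREFIX may resolve to "writer.encrypt" and ENCRYPTION_ALGORITHM_KEY to "algorithm"), and only the composition pattern is taken from the test:

// Hedged sketch: composing a branch-scoped encryption key.
// The constant values named in the trailing comment are assumptions.
int numBranches = 2;
int branch = 1;
String branchString = numBranches > 1 ? String.format(".%d", branch) : "";
String algorithmKey = EncryptionConfigParser.WRITER_ENCRYPT_PREFIX
        + "." + EncryptionConfigParser.ENCRYPTION_ALGORITHM_KEY
        + branchString;
// e.g. "writer.encrypt.algorithm.1" under the assumed constant values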

Example 4 with State

Use of org.apache.gobblin.configuration.State in project incubator-gobblin by apache.

The class CsvFileDownloader, method downloadFile.

/**
 * Provides an iterator via OpenCSV's CSVReader.
 * Provides a way to skip top rows that match a regex (useful when the CSV file
 * starts with comment rows whose count is not fixed).
 * It also validates the schema by matching the header names in the property's
 * schema against the header row in the CSV file.
 *
 * {@inheritDoc}
 * @see org.apache.gobblin.source.extractor.filebased.FileDownloader#downloadFile(java.lang.String)
 */
@SuppressWarnings("unchecked")
@Override
public Iterator<String[]> downloadFile(String file) throws IOException {
    log.info("Beginning to download file: " + file);
    final State state = fileBasedExtractor.workUnitState;
    CSVReader reader;
    try {
        if (state.contains(DELIMITER)) {
            String delimiterStr = state.getProp(DELIMITER).trim();
            Preconditions.checkArgument(delimiterStr.length() == 1, "Delimiter should be a character.");
            char delimiter = delimiterStr.charAt(0);
            log.info("Using " + delimiter + " as a delimiter.");
            reader = this.fileBasedExtractor.getCloser().register(new CSVReader(new InputStreamReader(this.fileBasedExtractor.getFsHelper().getFileStream(file), ConfigurationKeys.DEFAULT_CHARSET_ENCODING), delimiter));
        } else {
            reader = this.fileBasedExtractor.getCloser().register(new CSVReader(new InputStreamReader(this.fileBasedExtractor.getFsHelper().getFileStream(file), ConfigurationKeys.DEFAULT_CHARSET_ENCODING)));
        }
    } catch (FileBasedHelperException e) {
        throw new IOException(e);
    }
    PeekingIterator<String[]> iterator = Iterators.peekingIterator(reader.iterator());
    if (state.contains(SKIP_TOP_ROWS_REGEX)) {
        String regex = state.getProp(SKIP_TOP_ROWS_REGEX);
        log.info("Trying to skip with regex: " + regex);
        while (iterator.hasNext()) {
            String[] row = iterator.peek();
            if (row.length == 0) {
                break;
            }
            if (!row[0].matches(regex)) {
                break;
            }
            iterator.next();
        }
    }
    if (this.fileBasedExtractor.isShouldSkipFirstRecord() && iterator.hasNext()) {
        log.info("Skipping first record");
        iterator.next();
    }
    return iterator;
}
Also used: InputStreamReader (java.io.InputStreamReader), CSVReader (com.opencsv.CSVReader), State (org.apache.gobblin.configuration.State), IOException (java.io.IOException)
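
A minimal sketch of supplying the two optional settings that downloadFile() reads from the work unit state. DELIMITER and SKIP_TOP_ROWS_REGEX are the same constants referenced above (their literal key strings are not reproduced here), and the sample values are assumptions for illustration:

// Hedged configuration sketch for the downloader.
State state = this.fileBasedExtractor.workUnitState;
state.setProp(DELIMITER, "|");              // must be exactly one character
state.setProp(SKIP_TOP_ROWS_REGEX, "^#.*"); // skip leading '#' comment rows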

Example 5 with State

Use of org.apache.gobblin.configuration.State in project incubator-gobblin by apache.

The class HiveSerDeTest, method testAvroOrcSerDes.

/**
 * This test uses the Avro SerDe to deserialize data from Avro files, and the ORC SerDe
 * to serialize the resulting records into ORC files.
 */
@Test(groups = { "gobblin.serde" })
public void testAvroOrcSerDes() throws IOException, DataRecordException, DataConversionException {
    Properties properties = new Properties();
    properties.load(new FileReader("gobblin-core/src/test/resources/serde/serde.properties"));
    SourceState sourceState = new SourceState(new State(properties), ImmutableList.<WorkUnitState>of());
    OldApiWritableFileSource source = new OldApiWritableFileSource();
    List<WorkUnit> workUnits = source.getWorkunits(sourceState);
    Assert.assertEquals(workUnits.size(), 1);
    WorkUnitState wus = new WorkUnitState(workUnits.get(0));
    wus.addAll(sourceState);
    Closer closer = Closer.create();
    HiveWritableHdfsDataWriter writer = null;
    try {
        OldApiWritableFileExtractor extractor = closer.register((OldApiWritableFileExtractor) source.getExtractor(wus));
        HiveSerDeConverter converter = closer.register(new HiveSerDeConverter());
        writer = closer.register((HiveWritableHdfsDataWriter) new HiveWritableHdfsDataWriterBuilder<>().withBranches(1).withWriterId("0").writeTo(Destination.of(DestinationType.HDFS, sourceState)).writeInFormat(WriterOutputFormat.ORC).build());
        converter.init(wus);
        Writable record;
        while ((record = extractor.readRecord(null)) != null) {
            Iterable<Writable> convertedRecordIterable = converter.convertRecordImpl(null, record, wus);
            Assert.assertEquals(Iterators.size(convertedRecordIterable.iterator()), 1);
            writer.write(convertedRecordIterable.iterator().next());
        }
    } catch (Throwable t) {
        throw closer.rethrow(t);
    } finally {
        closer.close();
        if (writer != null) {
            writer.commit();
        }
        Assert.assertTrue(this.fs.exists(new Path(sourceState.getProp(ConfigurationKeys.WRITER_OUTPUT_DIR), sourceState.getProp(ConfigurationKeys.WRITER_FILE_NAME))));
        HadoopUtils.deletePath(this.fs, new Path(sourceState.getProp(ConfigurationKeys.WRITER_OUTPUT_DIR)), true);
    }
}
Also used: Closer (com.google.common.io.Closer), Path (org.apache.hadoop.fs.Path), SourceState (org.apache.gobblin.configuration.SourceState), OldApiWritableFileExtractor (org.apache.gobblin.source.extractor.hadoop.OldApiWritableFileExtractor), WorkUnitState (org.apache.gobblin.configuration.WorkUnitState), HiveSerDeConverter (org.apache.gobblin.converter.serde.HiveSerDeConverter), Writable (org.apache.hadoop.io.Writable), Properties (java.util.Properties), HiveWritableHdfsDataWriterBuilder (org.apache.gobblin.writer.HiveWritableHdfsDataWriterBuilder), HiveWritableHdfsDataWriter (org.apache.gobblin.writer.HiveWritableHdfsDataWriter), State (org.apache.gobblin.configuration.State), FileReader (java.io.FileReader), OldApiWritableFileSource (org.apache.gobblin.source.extractor.hadoop.OldApiWritableFileSource), WorkUnit (org.apache.gobblin.source.workunit.WorkUnit), Test (org.testng.annotations.Test)
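
The try/catch/finally shape in the test is Guava's Closer idiom. A self-contained sketch of the pattern in isolation, using generic java.io resources rather than Gobblin types (the file name is illustrative):

import com.google.common.io.Closer;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

public class CloserIdiom {
    public static void main(String[] args) throws IOException {
        Closer closer = Closer.create();
        try {
            // register() returns the resource and guarantees it is closed later
            InputStream in = closer.register(new FileInputStream("data.bin"));
            // ... consume the stream ...
        } catch (Throwable t) {
            // rethrow() preserves the original failure even if close() also fails
            throw closer.rethrow(t);
        } finally {
            closer.close();
        }
    }
}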

Aggregations

State (org.apache.gobblin.configuration.State): 195
Test (org.testng.annotations.Test): 103
WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 74
SourceState (org.apache.gobblin.configuration.SourceState): 38
Path (org.apache.hadoop.fs.Path): 30
File (java.io.File): 20
IOException (java.io.IOException): 16
Map (java.util.Map): 14
WorkingState (org.apache.gobblin.configuration.WorkUnitState.WorkingState): 14
WorkUnit (org.apache.gobblin.source.workunit.WorkUnit): 14
TaskState (org.apache.hadoop.mapreduce.v2.api.records.TaskState): 13
Properties (java.util.Properties): 12
FinalState (org.apache.gobblin.util.FinalState): 12
Configuration (org.apache.hadoop.conf.Configuration): 12
TaskLevelPolicyCheckResults (org.apache.gobblin.qualitychecker.task.TaskLevelPolicyCheckResults): 9
Config (com.typesafe.config.Config): 8
ArrayList (java.util.ArrayList): 8
GenericRecord (org.apache.avro.generic.GenericRecord): 8
LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark): 7
FileInputStream (java.io.FileInputStream): 6