Example 76 with State

Use of org.apache.gobblin.configuration.State in project incubator-gobblin by apache.

Class RecoveryHelperTest, method testPurge.

@Test
public void testPurge() throws Exception {
    String content = "contents";
    File persistDirBase = Files.createTempDir();
    persistDirBase.deleteOnExit();
    State state = new State();
    state.setProp(RecoveryHelper.PERSIST_DIR_KEY, persistDirBase.getAbsolutePath());
    state.setProp(RecoveryHelper.PERSIST_RETENTION_KEY, "1");
    RecoveryHelper recoveryHelper = new RecoveryHelper(FileSystem.getLocal(new Configuration()), state);
    File persistDir = new File(RecoveryHelper.getPersistDir(state).get().toString());
    persistDir.mkdir();
    File file = new File(persistDir, "file1");
    OutputStream os = new FileOutputStream(file);
    IOUtils.write(content, os);
    os.close();
    // Age file1 past the one-hour retention window so the purge removes it.
    file.setLastModified(System.currentTimeMillis() - TimeUnit.HOURS.toMillis(2));
    // file2 keeps its current modification time and should survive the purge.
    File file2 = new File(persistDir, "file2");
    OutputStream os2 = new FileOutputStream(file2);
    IOUtils.write(content, os2);
    os2.close();
    Assert.assertEquals(persistDir.listFiles().length, 2);
    recoveryHelper.purgeOldPersistedFile();
    // Only the recent file2 remains.
    Assert.assertEquals(persistDir.listFiles().length, 1);
}
Also used : CopyConfiguration(org.apache.gobblin.data.management.copy.CopyConfiguration) Configuration(org.apache.hadoop.conf.Configuration) State(org.apache.gobblin.configuration.State) OutputStream(java.io.OutputStream) FileOutputStream(java.io.FileOutputStream) File(java.io.File) CopyableFile(org.apache.gobblin.data.management.copy.CopyableFile) Test(org.testng.annotations.Test)
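
The test implies that RecoveryHelper.PERSIST_RETENTION_KEY is interpreted in hours: file1 is aged two hours while retention is "1". A minimal sketch of the purge predicate that behavior suggests (an assumption about RecoveryHelper's internals, not its actual implementation):

    // Assumption: a persisted file is purged once its modification time
    // falls outside the configured retention window.
    long retentionHours = 1; // set via RecoveryHelper.PERSIST_RETENTION_KEY in the test
    long cutoff = System.currentTimeMillis() - TimeUnit.HOURS.toMillis(retentionHours);
    boolean shouldPurge = file.lastModified() < cutoff; // true for file1, false for file2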

Example 77 with State

Use of org.apache.gobblin.configuration.State in project incubator-gobblin by apache.

Class RecoveryHelperTest, method testPersistFile.

@Test
public void testPersistFile() throws Exception {
    String content = "contents";
    File stagingDir = Files.createTempDir();
    stagingDir.deleteOnExit();
    File file = new File(stagingDir, "file");
    OutputStream os = new FileOutputStream(file);
    IOUtils.write(content, os);
    os.close();
    Assert.assertEquals(stagingDir.listFiles().length, 1);
    State state = new State();
    state.setProp(RecoveryHelper.PERSIST_DIR_KEY, this.tmpDir.getAbsolutePath());
    state.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/publisher");
    File recoveryDir = new File(RecoveryHelper.getPersistDir(state).get().toUri().getPath());
    FileSystem fs = FileSystem.getLocal(new Configuration());
    CopyableFile copyableFile = CopyableFile.builder(fs, new FileStatus(0, false, 0, 0, 0, new Path("/file")), new Path("/dataset"), CopyConfiguration.builder(fs, state.getProperties()).preserve(PreserveAttributes.fromMnemonicString("")).build()).build();
    // Tag the state with the file's guid so the persisted copy can be located later.
    CopySource.setWorkUnitGuid(state, Guid.fromHasGuid(copyableFile));
    RecoveryHelper recoveryHelper = new RecoveryHelper(FileSystem.getLocal(new Configuration()), state);
    // persistFile moves the staged file into the recovery directory.
    recoveryHelper.persistFile(state, copyableFile, new Path(file.getAbsolutePath()));
    Assert.assertEquals(stagingDir.listFiles().length, 0);
    Assert.assertEquals(recoveryDir.listFiles().length, 1);
    File fileInRecovery = recoveryDir.listFiles()[0].listFiles()[0];
    Assert.assertEquals(IOUtils.readLines(new FileInputStream(fileInRecovery)).get(0), content);
    Optional<FileStatus> fileToRecover = recoveryHelper.findPersistedFile(state, copyableFile, Predicates.<FileStatus>alwaysTrue());
    Assert.assertTrue(fileToRecover.isPresent());
    Assert.assertEquals(fileToRecover.get().getPath().toUri().getPath(), fileInRecovery.getAbsolutePath());
    fileToRecover = recoveryHelper.findPersistedFile(state, copyableFile, Predicates.<FileStatus>alwaysFalse());
    Assert.assertFalse(fileToRecover.isPresent());
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) CopyConfiguration(org.apache.gobblin.data.management.copy.CopyConfiguration) Configuration(org.apache.hadoop.conf.Configuration) OutputStream(java.io.OutputStream) FileOutputStream(java.io.FileOutputStream) FileInputStream(java.io.FileInputStream) State(org.apache.gobblin.configuration.State) FileSystem(org.apache.hadoop.fs.FileSystem) CopyableFile(org.apache.gobblin.data.management.copy.CopyableFile) File(java.io.File) Test(org.testng.annotations.Test)
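
findPersistedFile accepts an arbitrary Guava Predicate<FileStatus>, so callers can match on attributes beyond alwaysTrue()/alwaysFalse(). A hedged sketch (illustrative only; the non-empty-file criterion is invented, and com.google.common.base.Predicate is assumed imported):

    // Recover only non-empty persisted files instead of accepting any candidate.
    Optional<FileStatus> recovered = recoveryHelper.findPersistedFile(state, copyableFile,
        new Predicate<FileStatus>() {
            @Override
            public boolean apply(FileStatus status) {
                return status.getLen() > 0;
            }
        });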

Example 78 with State

Use of org.apache.gobblin.configuration.State in project incubator-gobblin by apache.

Class ConfigStoreUtils, method getTopicsFromConfigStore.

/**
 * Get topics from the config store.
 * Topics are whitelisted or blacklisted via tags: topics are first filtered by
 * tag, and each shortlisted topic's config must then set the corresponding
 * topic.whitelist or topic.blacklist property to true.
 *
 * If no tags are provided, all topics are returned.
 */
public static List<KafkaTopic> getTopicsFromConfigStore(Properties properties, String configStoreUri, GobblinKafkaConsumerClient kafkaConsumerClient) {
    ConfigClient configClient = ConfigClient.createConfigClient(VersionStabilityPolicy.WEAK_LOCAL_STABILITY);
    State state = new State();
    state.setProp(KafkaSource.TOPIC_WHITELIST, ".*");
    state.setProp(KafkaSource.TOPIC_BLACKLIST, StringUtils.EMPTY);
    List<KafkaTopic> allTopics = kafkaConsumerClient.getFilteredTopics(DatasetFilterUtils.getPatternList(state, KafkaSource.TOPIC_BLACKLIST), DatasetFilterUtils.getPatternList(state, KafkaSource.TOPIC_WHITELIST));
    Optional<Config> runtimeConfig = ConfigClientUtils.getOptionalRuntimeConfig(properties);
    if (properties.containsKey(GOBBLIN_CONFIG_TAGS_WHITELIST)) {
        Preconditions.checkArgument(properties.containsKey(GOBBLIN_CONFIG_FILTER), "Missing required property " + GOBBLIN_CONFIG_FILTER);
        String filterString = properties.getProperty(GOBBLIN_CONFIG_FILTER);
        Path whiteListTagUri = PathUtils.mergePaths(new Path(configStoreUri), new Path(properties.getProperty(GOBBLIN_CONFIG_TAGS_WHITELIST)));
        List<String> whitelistedTopics = new ArrayList<>();
        ConfigStoreUtils.getTopicsURIFromConfigStore(configClient, whiteListTagUri, filterString, runtimeConfig).stream().filter((URI u) -> ConfigUtils.getBoolean(ConfigStoreUtils.getConfig(configClient, u, runtimeConfig), KafkaSource.TOPIC_WHITELIST, false)).forEach((URI u) -> whitelistedTopics.add(ConfigStoreUtils.getTopicNameFromURI(u)));
        return allTopics.stream().filter((KafkaTopic p) -> whitelistedTopics.contains(p.getName())).collect(Collectors.toList());
    } else if (properties.containsKey(GOBBLIN_CONFIG_TAGS_BLACKLIST)) {
        Preconditions.checkArgument(properties.containsKey(GOBBLIN_CONFIG_FILTER), "Missing required property " + GOBBLIN_CONFIG_FILTER);
        String filterString = properties.getProperty(GOBBLIN_CONFIG_FILTER);
        Path blackListTagUri = PathUtils.mergePaths(new Path(configStoreUri), new Path(properties.getProperty(GOBBLIN_CONFIG_TAGS_BLACKLIST)));
        List<String> blacklistedTopics = new ArrayList<>();
        ConfigStoreUtils.getTopicsURIFromConfigStore(configClient, blackListTagUri, filterString, runtimeConfig).stream().filter((URI u) -> ConfigUtils.getBoolean(ConfigStoreUtils.getConfig(configClient, u, runtimeConfig), KafkaSource.TOPIC_BLACKLIST, false)).forEach((URI u) -> blacklistedTopics.add(ConfigStoreUtils.getTopicNameFromURI(u)));
        return allTopics.stream().filter((KafkaTopic p) -> !blacklistedTopics.contains(p.getName())).collect(Collectors.toList());
    } else {
        log.warn("None of the blacklist or whitelist tags are provided");
        return allTopics;
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ConfigClient(org.apache.gobblin.config.client.ConfigClient) Config(com.typesafe.config.Config) ArrayList(java.util.ArrayList) URI(java.net.URI) State(org.apache.gobblin.configuration.State) List(java.util.List)
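
For context, a minimal caller-side sketch (not from the source). It assumes the GOBBLIN_CONFIG_TAGS_WHITELIST and GOBBLIN_CONFIG_FILTER constants referenced in the method are publicly visible on ConfigStoreUtils, and that a GobblinKafkaConsumerClient for the target cluster already exists; the store URI and tag path are illustrative values:

    Properties props = new Properties();
    String configStoreUri = "simple-hdfs://namenode:8020/configStore"; // illustrative
    // Only topics tagged under this path pass the tag filter; the filter string
    // narrows which part of the config store is scanned.
    props.setProperty(ConfigStoreUtils.GOBBLIN_CONFIG_TAGS_WHITELIST, "/tags/whitelist");
    props.setProperty(ConfigStoreUtils.GOBBLIN_CONFIG_FILTER, "/data/kafka");
    // kafkaConsumerClient: an already-constructed GobblinKafkaConsumerClient (assumed).
    List<KafkaTopic> topics =
        ConfigStoreUtils.getTopicsFromConfigStore(props, configStoreUri, kafkaConsumerClient);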

Example 79 with State

Use of org.apache.gobblin.configuration.State in project incubator-gobblin by apache.

Class KafkaSource, method getWorkUnitForTopicPartition.

private WorkUnit getWorkUnitForTopicPartition(KafkaPartition partition, Offsets offsets, Optional<State> topicSpecificState) {
    // Default to job level configurations
    Extract.TableType currentTableType = tableType;
    String currentExtractNamespace = extractNamespace;
    String currentExtractTableName = partition.getTopicName();
    boolean isCurrentFullExtract = isFullExtract;
    // Update to topic specific configurations if any
    if (topicSpecificState.isPresent()) {
        State topicState = topicSpecificState.get();
        if (topicState.contains(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY)) {
            currentTableType = Extract.TableType.valueOf(topicState.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY));
        }
        currentExtractNamespace = topicState.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, extractNamespace);
        currentExtractTableName = topicState.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, partition.getTopicName());
        isCurrentFullExtract = topicState.getPropAsBoolean(ConfigurationKeys.EXTRACT_IS_FULL_KEY, isFullExtract);
    }
    Extract extract = this.createExtract(currentTableType, currentExtractNamespace, currentExtractTableName);
    if (isCurrentFullExtract) {
        extract.setProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY, true);
    }
    WorkUnit workUnit = WorkUnit.create(extract);
    workUnit.setProp(TOPIC_NAME, partition.getTopicName());
    addDatasetUrnOptionally(workUnit);
    workUnit.setProp(PARTITION_ID, partition.getId());
    workUnit.setProp(LEADER_ID, partition.getLeader().getId());
    workUnit.setProp(LEADER_HOSTANDPORT, partition.getLeader().getHostAndPort().toString());
    workUnit.setProp(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY, offsets.getStartOffset());
    workUnit.setProp(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY, offsets.getLatestOffset());
    workUnit.setProp(PREVIOUS_OFFSET_FETCH_EPOCH_TIME, offsets.getPreviousOffsetFetchEpochTime());
    workUnit.setProp(OFFSET_FETCH_EPOCH_TIME, offsets.getOffsetFetchEpochTime());
    workUnit.setProp(PREVIOUS_LATEST_OFFSET, offsets.getPreviousLatestOffset());
    // Add lineage info
    DatasetDescriptor source = new DatasetDescriptor(DatasetConstants.PLATFORM_KAFKA, partition.getTopicName());
    source.addMetadata(DatasetConstants.BROKERS, kafkaBrokers);
    if (this.lineageInfo.isPresent()) {
        this.lineageInfo.get().setSource(source, workUnit);
    }
    LOG.info(String.format("Created workunit for partition %s: lowWatermark=%d, highWatermark=%d, range=%d", partition, offsets.getStartOffset(), offsets.getLatestOffset(), offsets.getLatestOffset() - offsets.getStartOffset()));
    return workUnit;
}
Also used : DatasetDescriptor(org.apache.gobblin.dataset.DatasetDescriptor) State(org.apache.gobblin.configuration.State) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) SourceState(org.apache.gobblin.configuration.SourceState) Extract(org.apache.gobblin.source.workunit.Extract) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)
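
A hedged consumer-side sketch (not from the source) of reading back the watermark properties this method sets; the key names are taken from the snippet above:

    // Inside code that later processes the WorkUnit created above.
    long lowWatermark = workUnit.getPropAsLong(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY);
    long highWatermark = workUnit.getPropAsLong(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY);
    // Matches the "range" value logged by getWorkUnitForTopicPartition.
    long recordsToPull = highWatermark - lowWatermark;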

Example 80 with State

Use of org.apache.gobblin.configuration.State in project incubator-gobblin by apache.

Class JdbcExtractorTest, method testUnsignedInt.

/**
 * Tests that the schema-metadata query includes the unsigned-int check
 * only when promotion of unsigned int to bigint is enabled.
 */
@Test
public void testUnsignedInt() throws SchemaException {
    State state = new WorkUnitState();
    state.setId("id");
    MysqlExtractor mysqlExtractor = new MysqlExtractor((WorkUnitState) state);
    List<Command> commands = mysqlExtractor.getSchemaMetadata("db", "table");
    assertTrue(commands.get(0).getCommandType() == JdbcCommand.JdbcCommandType.QUERY);
    assertTrue(commands.get(0).getParams().get(0).contains("bigint"));
    assertTrue(commands.get(1).getCommandType() == JdbcCommand.JdbcCommandType.QUERYPARAMS);
    assertTrue(!commands.get(1).getParams().get(0).contains("unsigned"));
    // set option to promote unsigned int to bigint
    state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_PROMOTE_UNSIGNED_INT_TO_BIGINT, "true");
    commands = mysqlExtractor.getSchemaMetadata("db", "table");
    assertTrue(commands.get(0).getCommandType() == JdbcCommand.JdbcCommandType.QUERY);
    assertTrue(commands.get(0).getParams().get(0).contains("bigint"));
    assertTrue(commands.get(1).getCommandType() == JdbcCommand.JdbcCommandType.QUERYPARAMS);
    assertTrue(commands.get(1).getParams().get(0).contains("unsigned"));
}
Also used : Command(org.apache.gobblin.source.extractor.extract.Command) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) Test(org.testng.annotations.Test)
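
An illustrative helper (not part of the test) that captures the QUERY/QUERYPARAMS command shape both halves of the test assert on, using TestNG's Assert.assertEquals(actual, expected) convention:

    // Asserts the two-command shape returned by getSchemaMetadata.
    static void assertMetadataShape(List<Command> commands) {
        Assert.assertEquals(commands.get(0).getCommandType(), JdbcCommand.JdbcCommandType.QUERY);
        Assert.assertEquals(commands.get(1).getCommandType(), JdbcCommand.JdbcCommandType.QUERYPARAMS);
    }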

Aggregations

State (org.apache.gobblin.configuration.State)195 Test (org.testng.annotations.Test)103 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)74 SourceState (org.apache.gobblin.configuration.SourceState)38 Path (org.apache.hadoop.fs.Path)30 File (java.io.File)20 IOException (java.io.IOException)16 Map (java.util.Map)14 WorkingState (org.apache.gobblin.configuration.WorkUnitState.WorkingState)14 WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)14 TaskState (org.apache.hadoop.mapreduce.v2.api.records.TaskState)13 Properties (java.util.Properties)12 FinalState (org.apache.gobblin.util.FinalState)12 Configuration (org.apache.hadoop.conf.Configuration)12 TaskLevelPolicyCheckResults (org.apache.gobblin.qualitychecker.task.TaskLevelPolicyCheckResults)9 Config (com.typesafe.config.Config)8 ArrayList (java.util.ArrayList)8 GenericRecord (org.apache.avro.generic.GenericRecord)8 LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)7 FileInputStream (java.io.FileInputStream)6