Search in sources :

Example 1 with Schema

use of org.embulk.spi.Schema in project embulk by embulk.

the class ConfigInputPlugin method transaction.

@Override
public ConfigDiff transaction(ConfigSource config, InputPlugin.Control control) {
    final PluginTask task = config.loadConfig(PluginTask.class);
    final Schema schema = task.getSchemaConfig().toSchema();
    final List<List<List<JsonNode>>> values = task.getValues();
    final int taskCount = values.size();
    return resume(task.dump(), schema, taskCount, control);
}
Also used : Schema(org.embulk.spi.Schema) List(java.util.List) JsonNode(com.fasterxml.jackson.databind.JsonNode)
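
For context, the PluginTask above is only referenced, not shown. A minimal sketch of what such a task interface could look like, assuming the config keys are named "columns" and "values" (both key names are assumptions, not taken from the snippet):

import java.util.List;

import com.fasterxml.jackson.databind.JsonNode;
import org.embulk.config.Config;
import org.embulk.config.Task;
import org.embulk.spi.SchemaConfig;

// A minimal sketch only; the real ConfigInputPlugin task may use different config keys.
public interface PluginTask extends Task {

    // Column definitions; transaction() converts them to a Schema via toSchema().
    @Config("columns")
    SchemaConfig getSchemaConfig();

    // One inner list per task; each task carries rows of JSON values.
    @Config("values")
    List<List<List<JsonNode>>> getValues();
}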

Example 2 with Schema

use of org.embulk.spi.Schema in project embulk by embulk.

the class RenameFilterPlugin method applyRegexReplaceRule.

private Schema applyRegexReplaceRule(Schema inputSchema, RegexReplaceRule rule) {
    final String match = rule.getMatch();
    final String replace = rule.getReplace();
    Schema.Builder builder = Schema.builder();
    for (Column column : inputSchema.getColumns()) {
        // TODO(dmikurube): Check if we need a kind of sanitization?
        try {
            builder.add(column.getName().replaceAll(match, replace), column.getType());
        } catch (PatternSyntaxException ex) {
            throw new ConfigException(ex);
        }
    }
    return builder.build();
}
Also used : Column(org.embulk.spi.Column) Schema(org.embulk.spi.Schema) ConfigException(org.embulk.config.ConfigException) PatternSyntaxException(java.util.regex.PatternSyntaxException)
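
As a rough usage sketch, the same replaceAll-based renaming can be reproduced with the public Schema.builder() API. The column names and the regex below are illustrative assumptions, not taken from RenameFilterPlugin:

import org.embulk.spi.Column;
import org.embulk.spi.Schema;
import org.embulk.spi.type.Types;

// Illustrative sketch: rename columns ending in "_ts" to end in "_timestamp"
// using the same replaceAll loop as applyRegexReplaceRule above.
class RegexRenameSketch {

    static Schema rename(Schema inputSchema, String match, String replace) {
        Schema.Builder builder = Schema.builder();
        for (Column column : inputSchema.getColumns()) {
            builder.add(column.getName().replaceAll(match, replace), column.getType());
        }
        return builder.build();
    }

    static Schema example() {
        Schema.Builder input = Schema.builder();
        input.add("id", Types.LONG);
        input.add("created_ts", Types.TIMESTAMP);
        // Resulting columns: id, created_timestamp
        return rename(input.build(), "_ts$", "_timestamp");
    }
}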

Example 3 with Schema

use of org.embulk.spi.Schema in project embulk by embulk.

the class RenameFilterPlugin method applyUniqueNumberSuffixRule.

/**
 * Resolves conflicting column names by suffixing numbers.
 *
 * Conflicts are resolved by the following rules. The rules should not be changed casually because changing the
 * rules breaks compatibility.
 *
 * 1. Count all duplicates in the original column names. Indexes are counted up per original column name.
 * 2. Fix new column names from left to right:
 *   - Try to append the current index for the original column name (truncating if requested; truncation is not implemented)
 *     - Fix the new name if no duplicate is found among the already-fixed names on the left or the original column names
 *     - Retry with the index incremented if a duplicate is found among the already-fixed names on the left
 *
 * Examples:
 *     [c, c1, c1,   c2, c,   c3]
 * ==> [c, c1, c1_2, c2, c_2, c3]
 *
 * If a suffixed name conflicts with another column name, the conflicting index is simply skipped. For example:
 *     [c, c,   c_0, c_1, c_2]
 * ==> [c, c_3, c_0, c_1, c_2]
 *
 * If truncation is requested simultaneously with uniqueness (not implemented), it should work like:
 *     [co, c, co  , c  , co  , c  , ..., co  , c  , co  , c   , co  , c   ]
 * ==> [co, c, co_2, c_2, co_3, c_3, ..., co_9, c_9, c_10, c_11, c_12, c_13] (max_length:4)
 *
 *     [co, co  , co  , ..., co  , c, c  , ..., c  , co  , c  , co  , c  , co  , c   ]
 * ==> [co, co_2, co_3, ..., co_9, c, c_2, ..., c_7, c_10, c_8, c_11, c_9, c_12, c_13] (max_length:4)
 *
 * Note that the delimiter should not be omitted: recurring conflicts would confuse users.
 *     [c, c,  c,  ..., c,   c,   c,   c,   c1, c1,  c1]
 * NG: [c, c2, c3, ..., c10, c11, c12, c13, c1, c12, c13] (not unique!)
 * ==> [c, c2, c3, ..., c10, c11, c12, c13, c1, c14, c15] (confusing)
 */
private Schema applyUniqueNumberSuffixRule(Schema inputSchema, UniqueNumberSuffixRule rule) {
    final String delimiter = rule.getDelimiter();
    final Optional<Integer> digits = rule.getDigits();
    final Optional<Integer> maxLength = rule.getMaxLength();
    final int offset = rule.getOffset();
    // |delimiter| must be exactly 1 character so that checking it contains no digit stays trivial.
    if (delimiter == null || delimiter.length() != 1 || Character.isDigit(delimiter.charAt(0))) {
        throw new ConfigException("\"delimiter\" in rule \"unique_number_suffix\" must contain just 1 non-digit character");
    }
    if (maxLength.isPresent() && maxLength.get() < minimumMaxLengthInUniqueNumberSuffix) {
        throw new ConfigException("\"max_length\" in rule \"unique_number_suffix\" must be larger than " + (minimumMaxLengthInUniqueNumberSuffix - 1));
    }
    if (maxLength.isPresent() && digits.isPresent() && maxLength.get() < digits.get() + delimiter.length()) {
        throw new ConfigException("\"max_length\" in rule \"unique_number_suffix\" must be larger than \"digits\"");
    }
    int digitsOfNumberOfColumns = Integer.toString(inputSchema.getColumnCount() + offset - 1).length();
    if (maxLength.isPresent() && maxLength.get() <= digitsOfNumberOfColumns) {
        throw new ConfigException("\"max_length\" in rule \"unique_number_suffix\" must be larger than digits of ((number of columns) + \"offset\" - 1)");
    }
    if (digits.isPresent() && digits.get() <= digitsOfNumberOfColumns) {
        throw new ConfigException("\"digits\" in rule \"unique_number_suffix\" must be larger than digits of ((number of columns) + \"offset\" - 1)");
    }
    // Column names are not truncated at this point: uniqueness must be determined before truncation.
    // Iterate once to collect the initial state.
    HashSet<String> originalColumnNames = new HashSet<>();
    HashMap<String, Integer> columnNameCountups = new HashMap<>();
    for (Column column : inputSchema.getColumns()) {
        originalColumnNames.add(column.getName());
        columnNameCountups.put(column.getName(), offset);
    }
    Schema.Builder outputBuilder = Schema.builder();
    HashSet<String> fixedColumnNames = new HashSet<>();
    for (Column column : inputSchema.getColumns()) {
        String truncatedName = column.getName();
        if (column.getName().length() > maxLength.or(Integer.MAX_VALUE)) {
            truncatedName = column.getName().substring(0, maxLength.get());
        }
        // Conflicts with original names do not matter here.
        if (!fixedColumnNames.contains(truncatedName)) {
            // The original name is counted up.
            columnNameCountups.put(column.getName(), columnNameCountups.get(column.getName()) + 1);
            // The truncated name is fixed.
            fixedColumnNames.add(truncatedName);
            outputBuilder.add(truncatedName, column.getType());
            continue;
        }
        int index = columnNameCountups.get(column.getName());
        String concatenatedName;
        do {
            // This could be written with String#format(Locale.ENGLISH, ...), but Java's String#format
            // has no variable-width conversion ("%*d" in C's printf), so it would not be much simpler.
            String differentiatorString = Integer.toString(index);
            if (digits.isPresent() && (digits.get() > differentiatorString.length())) {
                differentiatorString = Strings.repeat("0", digits.get() - differentiatorString.length()) + differentiatorString;
            }
            differentiatorString = delimiter + differentiatorString;
            concatenatedName = column.getName() + differentiatorString;
            if (concatenatedName.length() > maxLength.or(Integer.MAX_VALUE)) {
                concatenatedName = column.getName().substring(0, maxLength.get() - differentiatorString.length()) + differentiatorString;
            }
            ++index;
        // Conflicts with original names matter when creating new names with suffixes.
        } while (fixedColumnNames.contains(concatenatedName) || originalColumnNames.contains(concatenatedName));
        // The original name is counted up.
        columnNameCountups.put(column.getName(), index);
        // The concatenated&truncated name is fixed.
        fixedColumnNames.add(concatenatedName);
        outputBuilder.add(concatenatedName, column.getType());
    }
    return outputBuilder.build();
}
Also used : HashMap(java.util.HashMap) Column(org.embulk.spi.Column) Schema(org.embulk.spi.Schema) ConfigException(org.embulk.config.ConfigException) HashSet(java.util.HashSet)
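
To see the numbering rules from the Javadoc in isolation, here is a simplified standalone sketch of the same suffixing idea over plain strings, with no truncation, the delimiter fixed to "_", and the offset fixed to 1; it reproduces the first Javadoc example, [c, c1, c1, c2, c, c3] ==> [c, c1, c1_2, c2, c_2, c3]. It is an illustration of the algorithm, not the plugin's code:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;

// Simplified sketch of the unique-number-suffix idea: no truncation,
// delimiter fixed to "_", offset fixed to 1.
class UniqueSuffixSketch {

    static List<String> resolve(List<String> names) {
        HashSet<String> originalNames = new HashSet<>(names);
        HashMap<String, Integer> countups = new HashMap<>();
        for (String name : names) {
            countups.put(name, 1);
        }
        HashSet<String> fixedNames = new HashSet<>();
        List<String> result = new ArrayList<>();
        for (String name : names) {
            if (!fixedNames.contains(name)) {
                // First occurrence (so far unfixed): keep the name as-is.
                countups.put(name, countups.get(name) + 1);
                fixedNames.add(name);
                result.add(name);
                continue;
            }
            int index = countups.get(name);
            String candidate;
            do {
                candidate = name + "_" + index;
                ++index;
                // Conflicts with both fixed names and original names are skipped.
            } while (fixedNames.contains(candidate) || originalNames.contains(candidate));
            countups.put(name, index);
            fixedNames.add(candidate);
            result.add(candidate);
        }
        return result;
    }
}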

Example 4 with Schema

use of org.embulk.spi.Schema in project embulk by embulk.

the class TestCsvTokenizer method recoverFromQuotedSizeLimitExceededException.

@Test
public void recoverFromQuotedSizeLimitExceededException() throws Exception {
    config.set("max_quoted_size_limit", 12);
    reloadPluginTask();
    String[] lines = new String[] {
        "v1,v2",
        "v3,\"0123", // this is a broken line and should be skipped
        "v4,v5",     // this line should not be skipped
        "v6,v7",     // this line should not be skipped
    };
    FileInput input = newFileInputFromLines(task, lines);
    LineDecoder decoder = new LineDecoder(input, task);
    CsvTokenizer tokenizer = new CsvTokenizer(decoder, task);
    Schema schema = task.getSchemaConfig().toSchema();
    tokenizer.nextFile();
    assertTrue(tokenizer.nextRecord());
    assertEquals("v1", tokenizer.nextColumn());
    assertEquals("v2", tokenizer.nextColumn());
    assertTrue(tokenizer.nextRecord());
    assertEquals("v3", tokenizer.nextColumn());
    try {
        tokenizer.nextColumn();
        fail();
    } catch (Exception e) {
        assertTrue(e instanceof CsvTokenizer.QuotedSizeLimitExceededException);
    }
    assertEquals("v3,\"0123", tokenizer.skipCurrentLine());
    assertTrue(tokenizer.nextRecord());
    assertEquals("v4", tokenizer.nextColumn());
    assertEquals("v5", tokenizer.nextColumn());
    assertTrue(tokenizer.nextRecord());
    assertEquals("v6", tokenizer.nextColumn());
    assertEquals("v7", tokenizer.nextColumn());
}
Also used : Schema(org.embulk.spi.Schema) ListFileInput(org.embulk.spi.util.ListFileInput) FileInput(org.embulk.spi.FileInput) LineDecoder(org.embulk.spi.util.LineDecoder) Test(org.junit.Test)
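
The recovery pattern this test exercises could be wrapped in a helper like the sketch below, which skips any line whose quoted field exceeds the limit and keeps tokenizing. It assumes the same class and imports as the test above (CsvTokenizer, Schema, Column, ArrayList, List); the helper itself is hypothetical:

// Hypothetical helper: reads all records, dropping lines that raise
// QuotedSizeLimitExceededException, as the test above does by hand.
private static List<List<String>> tokenizeSkippingBrokenLines(CsvTokenizer tokenizer, Schema schema) {
    List<List<String>> records = new ArrayList<>();
    while (tokenizer.nextRecord()) {
        try {
            List<String> record = new ArrayList<>();
            for (Column column : schema.getColumns()) {
                record.add(tokenizer.nextColumn());
            }
            records.add(record);
        } catch (CsvTokenizer.QuotedSizeLimitExceededException ex) {
            // Discard the partially read record and resynchronize on the next line.
            tokenizer.skipCurrentLine();
        }
    }
    return records;
}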

Example 5 with Schema

use of org.embulk.spi.Schema in project embulk by embulk.

the class TestCsvTokenizer method parse.

private static List<List<String>> parse(CsvParserPlugin.PluginTask task, FileInput input) {
    LineDecoder decoder = new LineDecoder(input, task);
    CsvTokenizer tokenizer = new CsvTokenizer(decoder, task);
    Schema schema = task.getSchemaConfig().toSchema();
    tokenizer.nextFile();
    List<List<String>> records = new ArrayList<>();
    while (tokenizer.nextRecord()) {
        List<String> record = new ArrayList<>();
        for (Column c : schema.getColumns()) {
            String v = tokenizer.nextColumnOrNull();
            record.add(v);
        }
        records.add(record);
    }
    return records;
}
Also used : Column(org.embulk.spi.Column) Schema(org.embulk.spi.Schema) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) LineDecoder(org.embulk.spi.util.LineDecoder)
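
A hypothetical call site for this helper, reusing the newFileInputFromLines helper from Example 4 and assuming the task is configured with two string columns and default CSV settings; the input lines and the expected records are illustrative (java.util.Arrays is assumed to be imported):

// Hypothetical usage sketch, not part of TestCsvTokenizer.
@Test
public void parseSimpleTwoColumnLines() throws Exception {
    FileInput input = newFileInputFromLines(task, new String[] { "a,b", "c,d" });
    List<List<String>> records = parse(task, input);
    assertEquals(Arrays.asList(
            Arrays.asList("a", "b"),
            Arrays.asList("c", "d")), records);
}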

Aggregations

Schema (org.embulk.spi.Schema): 22
TaskSource (org.embulk.config.TaskSource): 12
Column (org.embulk.spi.Column): 10
ConfigSource (org.embulk.config.ConfigSource): 9
ConfigException (org.embulk.config.ConfigException): 8
List (java.util.List): 7
FilterPlugin (org.embulk.spi.FilterPlugin): 7
Test (org.junit.Test): 6
ImmutableList (com.google.common.collect.ImmutableList): 5
InputPlugin (org.embulk.spi.InputPlugin): 5
SchemaConfigException (org.embulk.spi.SchemaConfigException): 5
ArrayList (java.util.ArrayList): 4
ConfigDiff (org.embulk.config.ConfigDiff): 3
PageOutput (org.embulk.spi.PageOutput): 3
HashMap (java.util.HashMap): 2
TaskReport (org.embulk.config.TaskReport): 2
ExecutorPlugin (org.embulk.spi.ExecutorPlugin): 2
Page (org.embulk.spi.Page): 2
LineDecoder (org.embulk.spi.util.LineDecoder): 2
JsonNode (com.fasterxml.jackson.databind.JsonNode): 1