use of org.embulk.config.ConfigException in project embulk by embulk.
the class RenameFilterPlugin method applyRegexReplaceRule.
/**
 * Builds a new schema in which every column name has been rewritten with
 * {@link String#replaceAll(String, String)}, using the rule's "match" as the regular expression and
 * the rule's "replace" as the replacement string. Column types are carried over unchanged.
 *
 * @param inputSchema the schema whose columns are renamed
 * @param rule the rule providing the regular expression and the replacement string
 * @return a schema with the same column types and the rewritten column names
 * @throws ConfigException if the regular expression is syntactically invalid, or if the replacement
 *     string is malformed or refers to a capturing group that does not exist in the pattern
 */
private Schema applyRegexReplaceRule(Schema inputSchema, RegexReplaceRule rule) {
    final String match = rule.getMatch();
    final String replace = rule.getReplace();
    Schema.Builder builder = Schema.builder();
    for (Column column : inputSchema.getColumns()) {
        // TODO(dmikurube): Check if we need a kind of sanitization?
        final String renamed;
        try {
            renamed = column.getName().replaceAll(match, replace);
        } catch (IllegalArgumentException | IndexOutOfBoundsException ex) {
            // PatternSyntaxException (a subclass of IllegalArgumentException) reports a broken "match".
            // A broken "replace" (e.g. a dangling '$' or '\', or a reference such as "$2" to a group that
            // the pattern does not define) surfaces as a plain IllegalArgumentException or an
            // IndexOutOfBoundsException. Both originate from user configuration, so report them alike.
            throw new ConfigException(ex);
        }
        // Kept outside the try block so that only failures of the replacement itself are converted.
        builder.add(renamed, column.getType());
    }
    return builder.build();
}
use of org.embulk.config.ConfigException in project embulk by embulk.
the class RenameFilterPlugin method applyUniqueNumberSuffixRule.
/**
 * Resolves conflicting column names by suffixing numbers.
 *
 * Conflicts are resolved by the following rules. The rules should not be changed casually because changing the
 * rules breaks compatibility.
 *
 * 1. Count all duplicates in the original column names. Indexes are counted up per original column name.
 * 2. Fix new column names from the left to the right
 *   - Try to append the current index for the original column name (with truncation if requested (not implemented))
 *   - Fix the new name if no duplication is found with fixed column names on the left and original column names
 *   - Retry with an index incremented if a duplication is found with fixed column names on the left
 *
 * Examples:
 *     [c, c1, c1,   c2, c,   c3]
 * ==> [c, c1, c1_2, c2, c_2, c3]
 *
 * If a newly suffixed name newly conflicts with other columns, the index is just skipped. For example:
 *     [c, c,   c_0, c_1, c_2]
 * ==> [c, c_3, c_0, c_1, c_2]
 *
 * If truncation is requested simultaneously with uniqueness (not implemented), it should work like:
 *     [co, c, co  , c  , co  , c  , ..., co  , c  , co  , c   , co  , c   ]
 * ==> [co, c, co_2, c_2, co_3, c_3, ..., co_9, c_9, c_10, c_11, c_12, c_13] (max_length:4)
 *
 *     [co, co  , co  , ..., co  , c, c  , ..., c  , co  , c  , co  , c  , co  , c   ]
 * ==> [co, co_2, co_3, ..., co_9, c, c_2, ..., c_7, c_10, c_8, c_11, c_9, c_12, c_13] (max_length:4)
 *
 * Note that a delimiter should not be omitted. Recurring conflicts may confuse users.
 *     [c, c,  c,  ..., c,   c,   c,   c,   c1, c1,  c1]
 * NG: [c, c2, c3, ..., c10, c11, c12, c13, c1, c12, c13] (not unique!)
 * ==> [c, c2, c3, ..., c10, c11, c12, c13, c1, c14, c15] (confusing)
 *
 * NOTE(review): the method body does shorten names to "max_length" when it is configured, so the
 * "(not implemented)" remarks above may be outdated -- confirm before relying on them.
 *
 * @param inputSchema the schema whose column names are made unique
 * @param rule the rule providing "delimiter", "digits", "max_length" and "offset"
 * @return a schema with the same column types and the resolved column names
 * @throws ConfigException if "delimiter" is null, is not exactly 1 character or is a digit; if
 *     "max_length" is smaller than the minimum allowed, smaller than "digits" plus the delimiter
 *     length, or not larger than the digit count of ((number of columns) + "offset" - 1); or if
 *     "digits" is not larger than that digit count
 */
private Schema applyUniqueNumberSuffixRule(Schema inputSchema, UniqueNumberSuffixRule rule) {
    final String delimiter = rule.getDelimiter();
    final Optional<Integer> digits = rule.getDigits();
    final Optional<Integer> maxLength = rule.getMaxLength();
    final int offset = rule.getOffset();

    // A 1-character delimiter makes it cheap to verify that the delimiter contains no digit.
    final boolean delimiterIsSingleNonDigit =
            delimiter != null && delimiter.length() == 1 && !Character.isDigit(delimiter.charAt(0));
    if (!delimiterIsSingleNonDigit) {
        throw new ConfigException("\"delimiter\" in rule \"unique_number_suffix\" must contain just 1 non-digit character");
    }

    if (maxLength.isPresent()) {
        final int requestedLength = maxLength.get();
        if (requestedLength < minimumMaxLengthInUniqueNumberSuffix) {
            throw new ConfigException("\"max_length\" in rule \"unique_number_suffix\" must be larger than " + (minimumMaxLengthInUniqueNumberSuffix - 1));
        }
        if (digits.isPresent() && requestedLength < digits.get() + delimiter.length()) {
            throw new ConfigException("\"max_length\" in rule \"unique_number_suffix\" must be larger than \"digits\"");
        }
    }

    // Number of decimal digits of ((number of columns) + offset - 1).
    final int widestIndexDigits = Integer.toString(inputSchema.getColumnCount() + offset - 1).length();
    if (maxLength.isPresent() && maxLength.get() <= widestIndexDigits) {
        throw new ConfigException("\"max_length\" in rule \"unique_number_suffix\" must be larger than digits of ((number of columns) + \"offset\" - 1)");
    }
    if (digits.isPresent() && digits.get() <= widestIndexDigits) {
        throw new ConfigException("\"digits\" in rule \"unique_number_suffix\" must be larger than digits of ((number of columns) + \"offset\" - 1)");
    }

    // Without "max_length" no name can exceed this limit, so the truncation branches below never fire.
    final int lengthLimit = maxLength.or(Integer.MAX_VALUE);

    // First pass: remember every original (untruncated) name and start its counter at |offset|.
    // Uniqueness has to be judged on the original names before any truncation happens.
    HashSet<String> namesInInput = new HashSet<>();
    HashMap<String, Integer> nextIndexByName = new HashMap<>();
    for (Column column : inputSchema.getColumns()) {
        final String name = column.getName();
        namesInInput.add(name);
        nextIndexByName.put(name, offset);
    }

    // Second pass: settle the output names from the left to the right.
    Schema.Builder resolved = Schema.builder();
    HashSet<String> namesAlreadyFixed = new HashSet<>();
    for (Column column : inputSchema.getColumns()) {
        final String name = column.getName();
        final String truncated = name.length() > lengthLimit ? name.substring(0, lengthLimit) : name;

        // Only names fixed on the left matter here; a clash with an original name on the right does not.
        if (namesAlreadyFixed.add(truncated)) {
            // The (possibly truncated) name is still free: use it as is and count up the original name.
            nextIndexByName.put(name, nextIndexByName.get(name) + 1);
            resolved.add(truncated, column.getType());
        } else {
            int index = nextIndexByName.get(name);
            String candidate;
            while (true) {
                // Build <delimiter><zero-padded index>; the padding applies only when "digits" is set.
                final String number = Integer.toString(index);
                final StringBuilder suffixBuilder = new StringBuilder(delimiter);
                if (digits.isPresent()) {
                    for (int padding = digits.get() - number.length(); padding > 0; --padding) {
                        suffixBuilder.append('0');
                    }
                }
                final String suffix = suffixBuilder.append(number).toString();

                candidate = name + suffix;
                if (candidate.length() > lengthLimit) {
                    // Shorten the base name so that the suffix is kept whole within the limit.
                    candidate = name.substring(0, lengthLimit - suffix.length()) + suffix;
                }
                ++index;

                // A suffixed name must clash neither with a fixed name nor with any original name.
                if (!namesAlreadyFixed.contains(candidate) && !namesInInput.contains(candidate)) {
                    break;
                }
            }
            // Remember where to resume for the next duplicate of this original name.
            nextIndexByName.put(name, index);
            namesAlreadyFixed.add(candidate);
            resolved.add(candidate, column.getType());
        }
    }
    return resolved.build();
}
use of org.embulk.config.ConfigException in project embulk by embulk.
the class TestRenameFilterPlugin method checkConfigExceptionIfUnknownRenamingOperatorName.
/**
 * Checks that a rule map naming an unknown renaming operator is rejected with a ConfigException.
 */
@Test
public void checkConfigExceptionIfUnknownRenamingOperatorName() {
    final ConfigSource configWithUnknownRule = Exec.newConfigSource()
            .set("rules", ImmutableList.of(ImmutableMap.of("rule", "some_unknown_renaming_operator")));
    // The control callback does nothing; only the exception raised by transaction() matters.
    final FilterPlugin.Control doNothing = new FilterPlugin.Control() {
        @Override
        public void run(TaskSource task, Schema schema) {
        }
    };
    try {
        filter.transaction(configWithUnknownRule, SCHEMA, doNothing);
        // Reaching this line means nothing was thrown; the resulting AssertionError is itself
        // caught below and fails the instanceof check.
        fail();
    } catch (Throwable thrown) {
        assertTrue(thrown instanceof ConfigException);
    }
}
use of org.embulk.config.ConfigException in project embulk by embulk.
the class TestRenameFilterPlugin method checkConfigExceptionIfUnknownListTypeOfRenamingOperator.
/**
 * Checks that a rule given as a list (instead of a map) is rejected with a ConfigException.
 */
@Test
public void checkConfigExceptionIfUnknownListTypeOfRenamingOperator() {
    // A list [] shouldn't come as a renaming rule.
    final ConfigSource configWithListRule = Exec.newConfigSource()
            .set("rules", ImmutableList.of(ImmutableList.of("listed_operator1", "listed_operator2")));
    // The control callback does nothing; only the exception raised by transaction() matters.
    final FilterPlugin.Control doNothing = new FilterPlugin.Control() {
        @Override
        public void run(TaskSource task, Schema schema) {
        }
    };
    try {
        filter.transaction(configWithListRule, SCHEMA, doNothing);
        // Reaching this line means nothing was thrown; the resulting AssertionError is itself
        // caught below and fails the instanceof check.
        fail();
    } catch (Throwable thrown) {
        assertTrue(thrown instanceof ConfigException);
    }
}
use of org.embulk.config.ConfigException in project embulk by embulk.
the class TestRenameFilterPlugin method checkConfigExceptionIfUnknownStringTypeOfRenamingOperator.
/**
 * Checks that a rule given as a plain string (instead of a map) is rejected with a ConfigException.
 */
@Test
public void checkConfigExceptionIfUnknownStringTypeOfRenamingOperator() {
    // A simple string shouldn't come as a renaming rule.
    final ConfigSource configWithStringRule = Exec.newConfigSource()
            .set("rules", ImmutableList.of("string_rule"));
    // The control callback does nothing; only the exception raised by transaction() matters.
    final FilterPlugin.Control doNothing = new FilterPlugin.Control() {
        @Override
        public void run(TaskSource task, Schema schema) {
        }
    };
    try {
        filter.transaction(configWithStringRule, SCHEMA, doNothing);
        // Reaching this line means nothing was thrown; the resulting AssertionError is itself
        // caught below and fails the instanceof check.
        fail();
    } catch (Throwable thrown) {
        assertTrue(thrown instanceof ConfigException);
    }
}
Aggregations