use of org.embulk.spi.Column in project embulk by embulk.
the class RenameFilterPlugin method applyRegexReplaceRule.
/**
 * Renames every column by applying a regular-expression replacement to its name.
 *
 * <p>The expression from {@code rule.getMatch()} is compiled once, before any column is visited, and
 * every match in each column name is replaced with {@code rule.getReplace()}. Column types are copied
 * to the output unchanged, and columns are added to the output in iteration order.
 *
 * <p>Because the expression is compiled up front, an invalid expression is reported even when the
 * input schema has no columns. Only {@link PatternSyntaxException} is translated; any exception raised
 * while applying the replacement string propagates as-is.
 *
 * @param inputSchema the schema whose column names are rewritten
 * @param rule the rule providing the regular expression and the replacement string
 * @return a new schema built from the rewritten column names and the original column types
 * @throws ConfigException if the regular expression of the rule has invalid syntax
 */
private Schema applyRegexReplaceRule(Schema inputSchema, RegexReplaceRule rule) {
    final String match = rule.getMatch();
    final String replace = rule.getReplace();

    // String#replaceAll would recompile the same expression for each column; compile it only once.
    final java.util.regex.Pattern pattern;
    try {
        pattern = java.util.regex.Pattern.compile(match);
    } catch (PatternSyntaxException ex) {
        throw new ConfigException(ex);
    }

    Schema.Builder builder = Schema.builder();
    for (Column column : inputSchema.getColumns()) {
        // TODO(dmikurube): Check if we need a kind of sanitization?
        builder.add(pattern.matcher(column.getName()).replaceAll(replace), column.getType());
    }
    return builder.build();
}
use of org.embulk.spi.Column in project embulk by embulk.
the class RenameFilterPlugin method applyUniqueNumberSuffixRule.
/**
* Resolves conflicting column names by suffixing numbers.
*
* Conflicts are resolved by the following rules. The rules should not be changed casually because changing the
* rules breaks compatibility.
*
* 1. Count all duplicates in the original column names. Indexes are counted up per original column name.
* 2. Fix new column names from the left to the right
* - Try to append the current index for the original column name (with truncation if "max_length" is given)
* - Fix the new name if no duplication is found with fixed column names on the left and original column names
* - Retry with an index incremented if a duplication is found with fixed column names on the left
*
* Examples:
* [c, c1, c1, c2, c, c3]
* ==> [c, c1, c1_2, c2, c_2, c3]
*
* If a newly suffixed name newly conflicts with other columns, the index is just skipped. For example:
* [c, c, c_0, c_1, c_2]
* ==> [c, c_3, c_0, c_1, c_2]
*
* If truncation is requested simultaneously with uniqueness, it should work like:
* [co, c, co , c , co , c , ..., co , c , co , c , co , c ]
* ==> [co, c, co_2, c_2, co_3, c_3, ..., co_9, c_9, c_10, c_11, c_12, c_13] (max_length:4)
*
* [co, co , co , ..., co , c, c , ..., c , co , c , co , c , co , c ]
* ==> [co, co_2, co_3, ..., co_9, c, c_2, ..., c_7, c_10, c_8, c_11, c_9, c_12, c_13] (max_length:4)
*
* Note that a delimiter should not be omitted. Recurring conflicts may confuse users.
* [c, c, c, ..., c, c, c, c, c1, c1, c1]
* NG: [c, c2, c3, ..., c10, c11, c12, c13, c1, c12, c13] (not unique!)
* ==> [c, c2, c3, ..., c10, c11, c12, c13, c1, c14, c15] (confusing)
*
* @param inputSchema the schema whose column names are made unique; column types are copied unchanged
* @param rule the rule providing the delimiter, the optional digits, the optional max_length and the offset
* @return a new schema with the same number of columns, in the same order, with the resolved names
* @throws ConfigException if the delimiter is null, is not exactly one character or is a digit, or if
*     "max_length" or "digits" fails any of the range checks at the top of the method
*/
private Schema applyUniqueNumberSuffixRule(Schema inputSchema, UniqueNumberSuffixRule rule) {
final String delimiter = rule.getDelimiter();
final Optional<Integer> digits = rule.getDigits();
final Optional<Integer> maxLength = rule.getMaxLength();
final int offset = rule.getOffset();
// |delimiter| must consist of just 1 character to check quickly that it does not contain any digit.
if (delimiter == null || delimiter.length() != 1 || Character.isDigit(delimiter.charAt(0))) {
throw new ConfigException("\"delimiter\" in rule \"unique_number_suffix\" must contain just 1 non-digit character");
}
// |maxLength| must not be below the lower bound |minimumMaxLengthInUniqueNumberSuffix|, which is
// declared outside this method.
if (maxLength.isPresent() && maxLength.get() < minimumMaxLengthInUniqueNumberSuffix) {
throw new ConfigException("\"max_length\" in rule \"unique_number_suffix\" must be larger than " + (minimumMaxLengthInUniqueNumberSuffix - 1));
}
// When both are given, |maxLength| must leave room for the delimiter followed by |digits| digits.
if (maxLength.isPresent() && digits.isPresent() && maxLength.get() < digits.get() + delimiter.length()) {
throw new ConfigException("\"max_length\" in rule \"unique_number_suffix\" must be larger than \"digits\"");
}
// Number of decimal digits of ((number of columns) + offset - 1). Both |maxLength| and |digits| must be
// strictly larger than this count.
int digitsOfNumberOfColumns = Integer.toString(inputSchema.getColumnCount() + offset - 1).length();
if (maxLength.isPresent() && maxLength.get() <= digitsOfNumberOfColumns) {
throw new ConfigException("\"max_length\" in rule \"unique_number_suffix\" must be larger than digits of ((number of columns) + \"offset\" - 1)");
}
if (digits.isPresent() && digits.get() <= digitsOfNumberOfColumns) {
throw new ConfigException("\"digits\" in rule \"unique_number_suffix\" must be larger than digits of ((number of columns) + \"offset\" - 1)");
}
// Columns should not be truncated here initially. Uniqueness should be identified before truncated.
// Iterate for initial states.
// |originalColumnNames| holds every input name; |columnNameCountups| maps each input name to the next
// index to try for it, starting at |offset|.
HashSet<String> originalColumnNames = new HashSet<>();
HashMap<String, Integer> columnNameCountups = new HashMap<>();
for (Column column : inputSchema.getColumns()) {
originalColumnNames.add(column.getName());
columnNameCountups.put(column.getName(), offset);
}
Schema.Builder outputBuilder = Schema.builder();
// Names already assigned to columns on the left of the one being processed.
HashSet<String> fixedColumnNames = new HashSet<>();
for (Column column : inputSchema.getColumns()) {
// Cut the name down to |maxLength| characters when it is longer than that.
String truncatedName = column.getName();
if (column.getName().length() > maxLength.or(Integer.MAX_VALUE)) {
truncatedName = column.getName().substring(0, maxLength.get());
}
// Conflicts with original names do not matter here.
if (!fixedColumnNames.contains(truncatedName)) {
// The original name is counted up.
// Keeping a name without a suffix still consumes one index for that original name.
columnNameCountups.put(column.getName(), columnNameCountups.get(column.getName()) + 1);
// The truncated name is fixed.
fixedColumnNames.add(truncatedName);
outputBuilder.add(truncatedName, column.getType());
continue;
}
// The (possibly truncated) name is already taken on the left: search for a free suffixed name,
// starting from the next index recorded for the original name.
int index = columnNameCountups.get(column.getName());
String concatenatedName;
do {
// This can be replaced with String#format(Locale.ENGLISH, ...), but Java's String#format does not
// have variable widths ("%*d" in C's printf). It cannot be very simple with String#format.
String differentiatorString = Integer.toString(index);
// Left-pad the index with zeros up to |digits| characters when |digits| is given.
if (digits.isPresent() && (digits.get() > differentiatorString.length())) {
differentiatorString = Strings.repeat("0", digits.get() - differentiatorString.length()) + differentiatorString;
}
differentiatorString = delimiter + differentiatorString;
concatenatedName = column.getName() + differentiatorString;
// When the suffixed name is too long, shorten the original name part so that the whole suffix is kept.
// NOTE(review): this assumes the suffix is never longer than |maxLength|; confirm that the checks
// above still guarantee it when indexes are skipped.
if (concatenatedName.length() > maxLength.or(Integer.MAX_VALUE)) {
concatenatedName = column.getName().substring(0, maxLength.get() - differentiatorString.length()) + differentiatorString;
}
// Incremented on every attempt, so after the loop |index| is one past the index that was used.
++index;
// Conflicts with original names matter when creating new names with suffixes.
} while (fixedColumnNames.contains(concatenatedName) || originalColumnNames.contains(concatenatedName));
// The original name is counted up.
columnNameCountups.put(column.getName(), index);
// The concatenated&truncated name is fixed.
fixedColumnNames.add(concatenatedName);
outputBuilder.add(concatenatedName, column.getType());
}
return outputBuilder.build();
}
use of org.embulk.spi.Column in project embulk by embulk.
the class TestCsvTokenizer method parse.
/**
 * Tokenizes the given input as CSV and collects every record as a list of column strings.
 *
 * <p>The number of values read per record equals the number of columns in the schema obtained from
 * {@code task.getSchemaConfig().toSchema()}. Each value is whatever
 * {@code CsvTokenizer#nextColumnOrNull()} returns, so a record may contain {@code null} entries.
 *
 * @param task the parser task used to configure the line decoder and the tokenizer and to get the schema
 * @param input the file input to read
 * @return one list per record, in the order the records were read
 */
private static List<List<String>> parse(CsvParserPlugin.PluginTask task, FileInput input) {
    final LineDecoder lineDecoder = new LineDecoder(input, task);
    final CsvTokenizer csvTokenizer = new CsvTokenizer(lineDecoder, task);
    final Schema schema = task.getSchemaConfig().toSchema();
    csvTokenizer.nextFile();

    // Only the column count matters here; the columns themselves are never inspected.
    final int columnCount = schema.getColumns().size();
    final List<List<String>> rows = new ArrayList<>();
    while (csvTokenizer.nextRecord()) {
        final List<String> row = new ArrayList<>();
        for (int i = 0; i < columnCount; i++) {
            row.add(csvTokenizer.nextColumnOrNull());
        }
        rows.add(row);
    }
    return rows;
}
use of org.embulk.spi.Column in project embulk by embulk.
the class TestRenameFilterPlugin method checkUniqueNumberSuffixRuleInternal.
/**
 * Runs the filter with a single "unique_number_suffix" rule and checks the resulting schema.
 *
 * <p>The input schema is built from {@code originalColumnNames}, with every column typed as STRING.
 * Optional rule parameters are only put into the configuration when they differ from their sentinel:
 * "delimiter" is omitted when it equals {@code DEFAULT}, "digits" is omitted when negative, and
 * "max_length" is omitted when it is -1. The resulting column names must equal
 * {@code expectedColumnNames}, and each resulting column must keep the type of the input column at the
 * same position.
 *
 * @param originalColumnNames the column names of the input schema
 * @param expectedColumnNames the column names expected in the output schema
 * @param delimiter the "delimiter" parameter, or {@code DEFAULT} to leave it unset
 * @param digits the "digits" parameter, or a negative value to leave it unset
 * @param max_length the "max_length" parameter, or -1 to leave it unset
 */
private void checkUniqueNumberSuffixRuleInternal(final String[] originalColumnNames, final String[] expectedColumnNames, String delimiter, int digits, int max_length) {
    final Schema.Builder inputBuilder = Schema.builder();
    for (int i = 0; i < originalColumnNames.length; ++i) {
        inputBuilder.add(originalColumnNames[i], STRING);
    }
    final Schema originalSchema = inputBuilder.build();

    // Assemble the rule; parameters carrying their sentinel value are left out of the config.
    final HashMap<String, Object> ruleParameters = new HashMap<>();
    ruleParameters.put("rule", "unique_number_suffix");
    if (!delimiter.equals(DEFAULT)) {
        ruleParameters.put("delimiter", delimiter);
    }
    if (digits >= 0) {
        ruleParameters.put("digits", digits);
    }
    if (max_length != -1) {
        ruleParameters.put("max_length", max_length);
    }
    final ConfigSource pluginConfig = Exec.newConfigSource()
            .set("rules", ImmutableList.of(ImmutableMap.copyOf(ruleParameters)));

    filter.transaction(pluginConfig, originalSchema, new FilterPlugin.Control() {
        @Override
        public void run(TaskSource task, Schema newSchema) {
            // Collect the names of the output schema in column order.
            final ArrayList<String> actualNames = new ArrayList<>(newSchema.size());
            for (Column resolvedColumn : newSchema.getColumns()) {
                actualNames.add(resolvedColumn.getName());
            }
            final String[] resolvedColumnNames = actualNames.toArray(new String[0]);
            assertEquals(expectedColumnNames, resolvedColumnNames);

            // Renaming must not touch the column types.
            for (int position = 0; position < expectedColumnNames.length; ++position) {
                final Column before = originalSchema.getColumn(position);
                final Column after = newSchema.getColumn(position);
                assertEquals(before.getType(), after.getType());
            }
        }
    });
}
use of org.embulk.spi.Column in project embulk by embulk.
the class TestRenameFilterPlugin method checkRenaming.
/**
 * Checks that a "columns" mapping of "_c0" to "_c0_new" renames the first column, leaves the second
 * column's name as "_c1", and keeps the types of both columns.
 */
@Test
public void checkRenaming() {
    final ConfigSource pluginConfig = Exec.newConfigSource()
            .set("columns", ImmutableMap.of("_c0", "_c0_new"));
    filter.transaction(pluginConfig, SCHEMA, new FilterPlugin.Control() {
        @Override
        public void run(TaskSource task, Schema newSchema) {
            // First column: renamed from _c0 to _c0_new, type preserved.
            final Column originalFirst = SCHEMA.getColumn(0);
            final Column renamedFirst = newSchema.getColumn(0);
            assertEquals("_c0_new", renamedFirst.getName());
            assertEquals(originalFirst.getType(), renamedFirst.getType());

            // Second column: not listed in the mapping, so name and type stay as they were.
            final Column originalSecond = SCHEMA.getColumn(1);
            final Column untouchedSecond = newSchema.getColumn(1);
            assertEquals("_c1", untouchedSecond.getName());
            assertEquals(originalSecond.getType(), untouchedSecond.getType());
        }
    });
}
Aggregations