Search in sources :

Example 11 with Column

use of org.embulk.spi.Column in project embulk by embulk.

the class RemoveColumnsFilterPlugin method transaction.

/**
 * Computes the output schema after dropping columns and hands it to {@code control}.
 *
 * <p>Exactly one of {@code remove:} and {@code keep:} must be configured. With {@code remove:}
 * the listed columns are dropped; with {@code keep:} every column that is not listed is dropped.
 * Surviving columns are re-indexed from 0 in input order, and the task receives an index mapping
 * from each input column index to its output index ({@code -1} for dropped columns).
 *
 * @param config the filter configuration
 * @param inputSchema the schema of the incoming records
 * @param control callback that receives the dumped task and the output schema
 * @throws ConfigException if both or neither of {@code remove:} and {@code keep:} are given
 */
@Override
public void transaction(ConfigSource config, Schema inputSchema, FilterPlugin.Control control) {
    PluginTask task = config.loadConfig(PluginTask.class);

    // remove: and keep: are mutually exclusive, and one of them is mandatory.
    final boolean removeMode = task.getRemove().isPresent();
    final boolean keepMode = task.getKeep().isPresent();
    if (removeMode && keepMode) {
        throw new ConfigException("remove: and keep: must not be multi-select");
    }
    if (!removeMode && !keepMode) {
        throw new ConfigException("Must require remove: or keep:");
    }

    boolean acceptUnmatchedColumns = task.getAcceptUnmatchedColumns();
    ImmutableList.Builder<Column> outputColumns = ImmutableList.builder();

    // Every input column starts out as "not present in the output".
    int[] indexMapping = new int[inputSchema.size()];
    java.util.Arrays.fill(indexMapping, -1);

    // Resolve the configured names against the input schema for whichever option was given.
    final List<String> listedColumns = removeMode
            ? getExistentColumns(inputSchema, task.getRemove().get(), acceptUnmatchedColumns)
            : getExistentColumns(inputSchema, task.getKeep().get(), acceptUnmatchedColumns);

    int nextIndex = 0;
    for (Column column : inputSchema.getColumns()) {
        final boolean listed = listedColumns.contains(column.getName());
        // remove: drops listed columns; keep: drops unlisted ones.
        if (listed == removeMode) {
            continue;
        }
        outputColumns.add(new Column(nextIndex, column.getName(), column.getType()));
        indexMapping[column.getIndex()] = nextIndex;
        nextIndex++;
    }

    task.setIndexMapping(indexMapping);
    control.run(task.dump(), new Schema(outputColumns.build()));
}
Also used : Column(org.embulk.spi.Column) ImmutableList(com.google.common.collect.ImmutableList) Schema(org.embulk.spi.Schema) ConfigException(org.embulk.config.ConfigException) SchemaConfigException(org.embulk.spi.SchemaConfigException)

Example 12 with Column

use of org.embulk.spi.Column in project embulk by embulk.

the class CsvFormatterPlugin method open.

/**
 * Opens a {@link PageOutput} that writes every record of the pages it receives as one delimited
 * line through a {@link LineEncoder}.
 *
 * <p>A new file is started on the encoder immediately and, if the task asks for a header line,
 * the header is written before any record. Within a record, every column after the first is
 * preceded by the delimiter; null values are written as the configured null string, and all other
 * values are passed through {@code setEscapeAndQuoteValue} before being written.
 *
 * @param taskSource source from which the {@code PluginTask} is loaded
 * @param schema schema of the records to format
 * @param output file output wrapped by the line encoder
 * @return a page output that formats records and forwards finish/close to the encoder
 */
@Override
public PageOutput open(TaskSource taskSource, final Schema schema, FileOutput output) {
    final PluginTask task = taskSource.loadTask(PluginTask.class);
    final LineEncoder encoder = new LineEncoder(output, task);
    final TimestampFormatter[] timestampFormatters = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions());
    final char delimiter = task.getDelimiterChar();
    final QuotePolicy quotePolicy = task.getQuotePolicy();

    // A NUL quote character falls back to the double quote.
    final char configuredQuote = task.getQuoteChar();
    final char quote = configuredQuote == '\0' ? '"' : configuredQuote;

    // Without an explicit escape character: backslash when quoting is disabled, else the quote itself.
    final char fallbackEscape = quotePolicy == QuotePolicy.NONE ? '\\' : quote;
    final char escape = task.getEscapeChar().or(fallbackEscape);

    final String newlineInField = task.getNewlineInField().getString();
    final String nullString = task.getNullString();

    // Start the output file, then emit the optional header line.
    encoder.nextFile();
    if (task.getHeaderLine()) {
        writeHeader(schema, encoder, delimiter, quotePolicy, quote, escape, newlineInField, nullString);
    }

    return new PageOutput() {

        private final PageReader pageReader = new PageReader(schema);

        private final String delimiterString = String.valueOf(delimiter);

        public void add(Page page) {
            pageReader.setPage(page);
            while (pageReader.nextRecord()) {
                // One visitor pass writes one record; each callback handles a single column.
                schema.visitColumns(new ColumnVisitor() {

                    public void booleanColumn(Column column) {
                        separate(column);
                        if (pageReader.isNull(column)) {
                            writeNull();
                            return;
                        }
                        writeValue(Boolean.toString(pageReader.getBoolean(column)));
                    }

                    public void longColumn(Column column) {
                        separate(column);
                        if (pageReader.isNull(column)) {
                            writeNull();
                            return;
                        }
                        writeValue(Long.toString(pageReader.getLong(column)));
                    }

                    public void doubleColumn(Column column) {
                        separate(column);
                        if (pageReader.isNull(column)) {
                            writeNull();
                            return;
                        }
                        writeValue(Double.toString(pageReader.getDouble(column)));
                    }

                    public void stringColumn(Column column) {
                        separate(column);
                        if (pageReader.isNull(column)) {
                            writeNull();
                            return;
                        }
                        writeValue(pageReader.getString(column));
                    }

                    public void timestampColumn(Column column) {
                        separate(column);
                        if (pageReader.isNull(column)) {
                            writeNull();
                            return;
                        }
                        // Formatters are looked up by the column's index in the schema.
                        Timestamp timestamp = pageReader.getTimestamp(column);
                        writeValue(timestampFormatters[column.getIndex()].format(timestamp));
                    }

                    public void jsonColumn(Column column) {
                        separate(column);
                        if (pageReader.isNull(column)) {
                            writeNull();
                            return;
                        }
                        Value json = pageReader.getJson(column);
                        writeValue(json.toJson());
                    }

                    // Emits the delimiter in front of every column except the first one.
                    private void separate(Column column) {
                        if (column.getIndex() == 0) {
                            return;
                        }
                        encoder.addText(delimiterString);
                    }

                    // Writes a non-null value after escaping/quoting it with the task's settings.
                    private void writeValue(String text) {
                        String encoded = setEscapeAndQuoteValue(text, delimiter, quotePolicy, quote, escape, newlineInField, nullString);
                        encoder.addText(encoded);
                    }

                    // Writes the configured null placeholder as-is.
                    private void writeNull() {
                        encoder.addText(nullString);
                    }
                });
                encoder.addNewLine();
            }
        }

        public void finish() {
            encoder.finish();
        }

        public void close() {
            encoder.close();
        }
    };
}
Also used : TimestampFormatter(org.embulk.spi.time.TimestampFormatter) LineEncoder(org.embulk.spi.util.LineEncoder) PageReader(org.embulk.spi.PageReader) Page(org.embulk.spi.Page) Timestamp(org.embulk.spi.time.Timestamp) ColumnVisitor(org.embulk.spi.ColumnVisitor) PageOutput(org.embulk.spi.PageOutput) Column(org.embulk.spi.Column) Value(org.msgpack.value.Value)

Example 13 with Column

use of org.embulk.spi.Column in project embulk by embulk.

the class RenameFilterPlugin method applyFirstCharacterTypesRule.

/**
 * Applies the {@code first_character_types} rule: column names whose first character is neither
 * of an allowed character type nor one of the allowed characters get that character replaced
 * ({@code replace}) or get a character prepended ({@code prefix}).
 *
 * <p>The regular expression is compiled once before the column loop instead of once per column.
 * As a consequence an invalid pattern is reported even when the schema has no columns.
 *
 * @param inputSchema the schema whose column names are checked
 * @param rule the rule configuration (pass types, pass characters, and replace or prefix)
 * @return a new schema with the same column types and the adjusted names
 * @throws ConfigException if {@code replace}/{@code prefix} are not exactly one character, are
 *         both given or both missing, if {@code pass_characters} contains {@code \E}, or if a
 *         pass type is not a known character type keyword
 */
private Schema applyFirstCharacterTypesRule(Schema inputSchema, FirstCharacterTypesRule rule) {
    final Optional<String> replace = rule.getReplace();
    final List<String> passTypes = rule.getPassTypes();
    final String passCharacters = rule.getPassCharacters();
    final Optional<String> prefix = rule.getPrefix();
    if (replace.isPresent() && replace.get().length() != 1) {
        throw new ConfigException("\"replace\" in \"first_character_types\" must contain just 1 character if specified");
    }
    if (prefix.isPresent() && prefix.get().length() != 1) {
        throw new ConfigException("\"prefix\" in \"first_character_types\" must contain just 1 character if specified");
    }
    if (prefix.isPresent() && replace.isPresent()) {
        throw new ConfigException("\"replace\" and \"prefix\" in \"first_character_types\" must not be specified together");
    }
    if ((!prefix.isPresent()) && (!replace.isPresent())) {
        throw new ConfigException("Either of \"replace\" or \"prefix\" must be specified in \"first_character_types\"");
    }
    // TODO(dmikurube): Revisit this for better escaping.
    // "\E" would terminate the \Q...\E quoting used below, so it is rejected outright.
    if (passCharacters.contains("\\E")) {
        throw new ConfigException("\"pass_characters\" in \"first_character_types\" must not contain \"\\E\"");
    }

    // Build "^[^<allowed>].*": a name matches when its first character is NOT allowed.
    StringBuilder regexBuilder = new StringBuilder();
    regexBuilder.append("^[^");
    for (String target : passTypes) {
        if (CHARACTER_TYPE_KEYWORDS.containsKey(target)) {
            regexBuilder.append(CHARACTER_TYPE_KEYWORDS.get(target));
        } else {
            throw new ConfigException("\"" + target + "\" is an unknown character type keyword");
        }
    }
    if (!passCharacters.isEmpty()) {
        regexBuilder.append("\\Q");
        regexBuilder.append(passCharacters);
        regexBuilder.append("\\E");
    }
    regexBuilder.append("].*");

    // Compile once; String.matches would rebuild and recompile the same pattern per column.
    final java.util.regex.Pattern disallowedFirstCharacter =
            java.util.regex.Pattern.compile(regexBuilder.toString());

    Schema.Builder schemaBuilder = Schema.builder();
    for (Column column : inputSchema.getColumns()) {
        String name = column.getName();
        if (disallowedFirstCharacter.matcher(name).matches()) {
            if (replace.isPresent()) {
                // Swap only the first character, keep the rest of the name.
                name = replace.get() + name.substring(1);
            } else if (prefix.isPresent()) {
                name = prefix.get() + name;
            }
        }
        schemaBuilder.add(name, column.getType());
    }
    return schemaBuilder.build();
}
Also used : Column(org.embulk.spi.Column) Schema(org.embulk.spi.Schema) ConfigException(org.embulk.config.ConfigException)

Example 14 with Column

use of org.embulk.spi.Column in project embulk by embulk.

the class RenameFilterPlugin method applyCharacterTypesRule.

/**
 * Applies the {@code character_types} rule: every character of a column name that is neither of
 * an allowed character type nor one of the allowed characters is replaced by {@code replace}.
 *
 * <p>The replacement is inserted literally. It is passed through
 * {@code Matcher.quoteReplacement} because {@code $} and {@code \} have special meaning in a
 * regex replacement string and would otherwise make a one-character {@code replace} of
 * {@code "$"} or {@code "\"} fail at runtime. The pattern is compiled once before the loop.
 *
 * @param inputSchema the schema whose column names are rewritten
 * @param rule the rule configuration (pass types, pass characters, replace)
 * @return a new schema with the same column types and the rewritten names
 * @throws ConfigException if {@code replace} is empty or longer than one character, if
 *         {@code pass_characters} contains {@code \E}, or if a pass type is not a known
 *         character type keyword
 */
private Schema applyCharacterTypesRule(Schema inputSchema, CharacterTypesRule rule) {
    final List<String> passTypes = rule.getPassTypes();
    final String passCharacters = rule.getPassCharacters();
    final String replace = rule.getReplace();
    if (replace.isEmpty()) {
        throw new ConfigException("\"replace\" in \"character_types\" must not be explicitly empty");
    }
    if (replace.length() != 1) {
        throw new ConfigException("\"replace\" in \"character_types\" must contain just 1 character");
    }
    // TODO(dmikurube): Revisit this for better escaping.
    // "\E" would terminate the \Q...\E quoting used below, so it is rejected outright.
    if (passCharacters.contains("\\E")) {
        throw new ConfigException("\"pass_characters\" in \"character_types\" must not contain \"\\E\"");
    }

    // Build "[^<allowed>]": matches each single character that is NOT allowed.
    StringBuilder regexBuilder = new StringBuilder();
    regexBuilder.append("[^");
    for (String target : passTypes) {
        if (CHARACTER_TYPE_KEYWORDS.containsKey(target)) {
            regexBuilder.append(CHARACTER_TYPE_KEYWORDS.get(target));
        } else {
            throw new ConfigException("\"" + target + "\" is an unknown character type keyword");
        }
    }
    if (!passCharacters.isEmpty()) {
        regexBuilder.append("\\Q");
        regexBuilder.append(passCharacters);
        regexBuilder.append("\\E");
    }
    regexBuilder.append("]");

    // Compile once instead of once per column via String.replaceAll.
    final java.util.regex.Pattern disallowedCharacter =
            java.util.regex.Pattern.compile(regexBuilder.toString());
    // Neutralize '$' and '\' so the configured character is always inserted verbatim.
    final String literalReplacement = java.util.regex.Matcher.quoteReplacement(replace);

    Schema.Builder schemaBuilder = Schema.builder();
    for (Column column : inputSchema.getColumns()) {
        String renamed = disallowedCharacter.matcher(column.getName()).replaceAll(literalReplacement);
        schemaBuilder.add(renamed, column.getType());
    }
    return schemaBuilder.build();
}
Also used : Column(org.embulk.spi.Column) Schema(org.embulk.spi.Schema) ConfigException(org.embulk.config.ConfigException)

Example 15 with Column

use of org.embulk.spi.Column in project embulk by embulk.

the class RenameFilterPlugin method transaction.

/**
 * Computes the renamed output schema and hands it to {@code control}.
 *
 * <p>Renaming happens in two stages: first the explicit name-to-name map from the task is
 * applied, then each configured rule is applied in list order, every rule seeing the result of
 * the previous one.
 *
 * @param config the filter configuration
 * @param inputSchema the schema of the incoming records
 * @param control callback that receives the dumped task and the output schema
 */
@Override
public void transaction(ConfigSource config, Schema inputSchema, FilterPlugin.Control control) {
    final PluginTask task = config.loadConfig(PluginTask.class);
    final Map<String, String> renameMap = task.getRenameMap();
    final List<ConfigSource> rulesList = task.getRulesList();

    // Every key of the rename map must name an existing input column;
    // lookupColumn is expected to throw SchemaConfigException otherwise.
    for (String sourceName : renameMap.keySet()) {
        inputSchema.lookupColumn(sourceName);
    }

    // Stage 1: explicit renames, applied before any rule.
    final Schema.Builder explicitlyRenamed = Schema.builder();
    for (Column column : inputSchema.getColumns()) {
        final String currentName = column.getName();
        final String newName = renameMap.containsKey(currentName) ? renameMap.get(currentName) : currentName;
        explicitlyRenamed.add(newName, column.getType());
    }

    // Stage 2: fold the rules over the schema, each one working on the previous result.
    Schema schema = explicitlyRenamed.build();
    for (ConfigSource rule : rulesList) {
        schema = applyRule(rule, schema);
    }

    control.run(task.dump(), schema);
}
Also used : ConfigSource(org.embulk.config.ConfigSource) Column(org.embulk.spi.Column) Schema(org.embulk.spi.Schema)

Aggregations

Column (org.embulk.spi.Column)16 Schema (org.embulk.spi.Schema)10 ConfigException (org.embulk.config.ConfigException)5 ConfigSource (org.embulk.config.ConfigSource)3 ColumnVisitor (org.embulk.spi.ColumnVisitor)3 DataException (org.embulk.spi.DataException)3 PageBuilder (org.embulk.spi.PageBuilder)3 JsonParseException (org.embulk.spi.json.JsonParseException)3 JsonParser (org.embulk.spi.json.JsonParser)3 ImmutableList (com.google.common.collect.ImmutableList)2 ArrayList (java.util.ArrayList)2 HashMap (java.util.HashMap)2 List (java.util.List)2 TaskSource (org.embulk.config.TaskSource)2 FilterPlugin (org.embulk.spi.FilterPlugin)2 TimestampFormatter (org.embulk.spi.time.TimestampFormatter)2 TimestampParseException (org.embulk.spi.time.TimestampParseException)2 TimestampParser (org.embulk.spi.time.TimestampParser)2 LineDecoder (org.embulk.spi.util.LineDecoder)2 Value (org.msgpack.value.Value)2