Search in sources :

Example 1 with PageBuilder

Use of org.embulk.spi.PageBuilder in project embulk by embulk.

In the class ConfigInputPlugin, method run:

/**
 * Writes the rows configured for the task at {@code taskIndex} to {@code output}.
 *
 * <p>Each configured row is a list of JSON nodes indexed by column position.
 * A missing node or a JSON null becomes a null cell; every other node is
 * converted according to the column type being visited.
 *
 * @param taskSource source from which the {@code PluginTask} is loaded
 * @param schema     schema whose columns are visited for every row
 * @param taskIndex  index into the task's configured value lists
 * @param output     destination handed to the {@code PageBuilder}
 * @return a new, empty task report
 * @throws DataException if a timestamp or JSON cell cannot be parsed
 */
@Override
public TaskReport run(TaskSource taskSource, Schema schema, int taskIndex, PageOutput output) {
    final PluginTask task = taskSource.loadTask(PluginTask.class);
    final List<List<JsonNode>> rows = task.getValues().get(taskIndex);
    final TimestampParser[] parsers = Timestamps.newTimestampColumnParsers(task, task.getSchemaConfig());
    final JsonParser json = new JsonParser();
    try (final PageBuilder builder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
        for (final List<JsonNode> row : rows) {
            schema.visitColumns(new ColumnVisitor() {

                @Override
                public void booleanColumn(Column column) {
                    final JsonNode cell = cellOrNull(column);
                    if (cell == null) {
                        builder.setNull(column);
                        return;
                    }
                    builder.setBoolean(column, cell.asBoolean());
                }

                @Override
                public void longColumn(Column column) {
                    final JsonNode cell = cellOrNull(column);
                    if (cell == null) {
                        builder.setNull(column);
                        return;
                    }
                    builder.setLong(column, cell.asLong());
                }

                @Override
                public void doubleColumn(Column column) {
                    final JsonNode cell = cellOrNull(column);
                    if (cell == null) {
                        builder.setNull(column);
                        return;
                    }
                    builder.setDouble(column, cell.asDouble());
                }

                @Override
                public void stringColumn(Column column) {
                    final JsonNode cell = cellOrNull(column);
                    if (cell == null) {
                        builder.setNull(column);
                        return;
                    }
                    builder.setString(column, cell.asText());
                }

                @Override
                public void timestampColumn(Column column) {
                    final JsonNode cell = cellOrNull(column);
                    if (cell == null) {
                        builder.setNull(column);
                        return;
                    }
                    try {
                        // One parser per column, looked up by column index.
                        builder.setTimestamp(column, parsers[column.getIndex()].parse(cell.asText()));
                    } catch (TimestampParseException ex) {
                        throw new DataException(ex);
                    }
                }

                @Override
                public void jsonColumn(Column column) {
                    final JsonNode cell = cellOrNull(column);
                    if (cell == null) {
                        builder.setNull(column);
                        return;
                    }
                    try {
                        // Re-serialize the node and parse it with the local JsonParser.
                        builder.setJson(column, json.parse(cell.toString()));
                    } catch (JsonParseException ex) {
                        throw new DataException(ex);
                    }
                }

                // Returns the node for this column, or null when it is absent or a JSON null.
                private JsonNode cellOrNull(Column column) {
                    final JsonNode cell = row.get(column.getIndex());
                    return (cell == null || cell.isNull()) ? null : cell;
                }
            });
            builder.addRecord();
        }
        builder.finish();
    }
    return Exec.newTaskReport();
}
Also used : TimestampParser(org.embulk.spi.time.TimestampParser) JsonNode(com.fasterxml.jackson.databind.JsonNode) PageBuilder(org.embulk.spi.PageBuilder) JsonParseException(org.embulk.spi.json.JsonParseException) TimestampParseException(org.embulk.spi.time.TimestampParseException) DataException(org.embulk.spi.DataException) ColumnVisitor(org.embulk.spi.ColumnVisitor) Column(org.embulk.spi.Column) List(java.util.List) JsonParser(org.embulk.spi.json.JsonParser)

Example 2 with PageBuilder

Use of org.embulk.spi.PageBuilder in project embulk by embulk.

In the class CsvParserPlugin, method run:

/**
 * Parses CSV input and writes one page record per valid CSV record.
 *
 * <p>For every file reported by the tokenizer, the configured number of
 * header lines is skipped, then each record is read by visiting the schema
 * columns in order and pulling one tokenizer column per schema column.
 * A record that raises a format, value, or validation error is either
 * rejected with a {@code DataException} (when
 * {@code task.getStopOnInvalidRecord()} is true) or logged and skipped.
 *
 * @param taskSource source from which the {@code PluginTask} is loaded
 * @param schema     schema whose columns drive how each CSV column is converted
 * @param input      file input wrapped by the line decoder and tokenizer
 * @param output     destination handed to the {@code PageBuilder}
 * @throws DataException if a record is invalid and stop-on-invalid-record is enabled
 */
@Override
public void run(TaskSource taskSource, final Schema schema, FileInput input, PageOutput output) {
    PluginTask task = taskSource.loadTask(PluginTask.class);
    final TimestampParser[] timestampParsers = Timestamps.newTimestampColumnParsers(task, task.getSchemaConfig());
    final JsonParser jsonParser = new JsonParser();
    final CsvTokenizer tokenizer = new CsvTokenizer(new LineDecoder(input, task), task);
    // Read the task options once so the anonymous visitor below can capture them as finals.
    final boolean allowOptionalColumns = task.getAllowOptionalColumns();
    final boolean allowExtraColumns = task.getAllowExtraColumns();
    final boolean stopOnInvalidRecord = task.getStopOnInvalidRecord();
    final int skipHeaderLines = task.getSkipHeaderLines();
    try (final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
        while (tokenizer.nextFile()) {
            // skip the header lines for each file
            for (int skipHeaderLineNumber = skipHeaderLines; skipHeaderLineNumber > 0; skipHeaderLineNumber--) {
                if (!tokenizer.skipHeaderLine()) {
                    // stop skipping as soon as the tokenizer reports it could not skip a line
                    break;
                }
            }
            if (!tokenizer.nextRecord()) {
                // empty file
                continue;
            }
            // Record loop: the tokenizer is already on a record when each iteration starts.
            while (true) {
                boolean hasNextRecord;
                try {
                    // Convert one tokenizer column per schema column, in schema order.
                    schema.visitColumns(new ColumnVisitor() {

                        // Boolean: true only when the text is a member of TRUE_STRINGS (defined outside this method).
                        public void booleanColumn(Column column) {
                            String v = nextColumn();
                            if (v == null) {
                                pageBuilder.setNull(column);
                            } else {
                                pageBuilder.setBoolean(column, TRUE_STRINGS.contains(v));
                            }
                        }

                        // Long: a non-numeric value invalidates the whole record.
                        public void longColumn(Column column) {
                            String v = nextColumn();
                            if (v == null) {
                                pageBuilder.setNull(column);
                            } else {
                                try {
                                    pageBuilder.setLong(column, Long.parseLong(v));
                                } catch (NumberFormatException e) {
                                    // TODO support default value
                                    throw new CsvRecordValidateException(e);
                                }
                            }
                        }

                        // Double: a non-numeric value invalidates the whole record.
                        public void doubleColumn(Column column) {
                            String v = nextColumn();
                            if (v == null) {
                                pageBuilder.setNull(column);
                            } else {
                                try {
                                    pageBuilder.setDouble(column, Double.parseDouble(v));
                                } catch (NumberFormatException e) {
                                    // TODO support default value
                                    throw new CsvRecordValidateException(e);
                                }
                            }
                        }

                        // String: stored as read.
                        public void stringColumn(Column column) {
                            String v = nextColumn();
                            if (v == null) {
                                pageBuilder.setNull(column);
                            } else {
                                pageBuilder.setString(column, v);
                            }
                        }

                        // Timestamp: parsed with the parser stored at this column's index.
                        public void timestampColumn(Column column) {
                            String v = nextColumn();
                            if (v == null) {
                                pageBuilder.setNull(column);
                            } else {
                                try {
                                    pageBuilder.setTimestamp(column, timestampParsers[column.getIndex()].parse(v));
                                } catch (TimestampParseException e) {
                                    // TODO support default value
                                    throw new CsvRecordValidateException(e);
                                }
                            }
                        }

                        // JSON: parsed with the local JsonParser; a parse failure invalidates the record.
                        public void jsonColumn(Column column) {
                            String v = nextColumn();
                            if (v == null) {
                                pageBuilder.setNull(column);
                            } else {
                                try {
                                    pageBuilder.setJson(column, jsonParser.parse(v));
                                } catch (JsonParseException e) {
                                    // TODO support default value
                                    throw new CsvRecordValidateException(e);
                                }
                            }
                        }

                        // Returns the next column text, or null. When optional columns are
                        // allowed and the record has no more columns, the missing column
                        // is reported as null without asking the tokenizer for it.
                        private String nextColumn() {
                            if (allowOptionalColumns && !tokenizer.hasNextColumn()) {
                                // TODO warning
                                return null;
                            }
                            return tokenizer.nextColumnOrNull();
                        }
                    });
                    // Advance before committing: if advancing throws, addRecord() below is not reached.
                    try {
                        hasNextRecord = tokenizer.nextRecord();
                    } catch (CsvTokenizer.TooManyColumnsException ex) {
                        if (allowExtraColumns) {
                            // Extra columns are tolerated: skip the current line and advance again.
                            // The skipped text is captured but not used yet (see TODO).
                            String tooManyColumnsLine = tokenizer.skipCurrentLine();
                            // TODO warning
                            hasNextRecord = tokenizer.nextRecord();
                        } else {
                            // this line will be skipped at the following catch section
                            throw ex;
                        }
                    }
                    pageBuilder.addRecord();
                } catch (CsvTokenizer.InvalidFormatException | CsvTokenizer.InvalidValueException | CsvRecordValidateException e) {
                    // Invalid record: skip the current line, then either fail the run or warn and continue.
                    String skippedLine = tokenizer.skipCurrentLine();
                    long lineNumber = tokenizer.getCurrentLineNumber();
                    if (stopOnInvalidRecord) {
                        throw new DataException(String.format("Invalid record at line %d: %s", lineNumber, skippedLine), e);
                    }
                    log.warn(String.format("Skipped line %d (%s): %s", lineNumber, e.getMessage(), skippedLine));
                    // exec.notice().skippedLine(skippedLine);
                    hasNextRecord = tokenizer.nextRecord();
                }
                if (!hasNextRecord) {
                    break;
                }
            }
        }
        pageBuilder.finish();
    }
}
Also used : TimestampParser(org.embulk.spi.time.TimestampParser) PageBuilder(org.embulk.spi.PageBuilder) JsonParseException(org.embulk.spi.json.JsonParseException) TimestampParseException(org.embulk.spi.time.TimestampParseException) DataException(org.embulk.spi.DataException) ColumnVisitor(org.embulk.spi.ColumnVisitor) Column(org.embulk.spi.Column) LineDecoder(org.embulk.spi.util.LineDecoder) JsonParser(org.embulk.spi.json.JsonParser)

Example 3 with PageBuilder

Use of org.embulk.spi.PageBuilder in project embulk by embulk.

In the class JsonParserPlugin, method run:

/**
 * Parses a stream of JSON values from each input file and writes every
 * map-typed value as one record into the first schema column.
 *
 * <p>A value that is not a map is treated as an invalid record: it is either
 * rejected with a {@code DataException} (when
 * {@code task.getStopOnInvalidRecord()} is true) or logged and skipped.
 * An I/O or JSON parse failure aborts the run, except in preview mode once
 * at least one record of the current file has been written; in that case
 * reading stops and the records built so far are finished normally.
 *
 * @param taskSource source from which the {@code PluginTask} is loaded
 * @param schema     schema whose column 0 receives the JSON record
 * @param input      file input read through a {@code FileInputInputStream}
 * @param output     destination handed to the page builder
 * @throws DataException on an invalid record (when stopping is enabled) or
 *                       on an I/O / JSON parse failure
 */
@Override
public void run(TaskSource taskSource, Schema schema, FileInput input, PageOutput output) {
    PluginTask task = taskSource.loadTask(PluginTask.class);
    final boolean stopOnInvalidRecord = task.getStopOnInvalidRecord();
    // record column
    final Column column = schema.getColumn(0);
    try (PageBuilder pageBuilder = newPageBuilder(schema, output);
        FileInputInputStream in = new FileInputInputStream(input)) {
        while (in.nextFile()) {
            // Tracked per file: reset for every file the stream moves on to.
            boolean evenOneJsonParsed = false;
            try (JsonParser.Stream stream = newJsonStream(in, task)) {
                Value value;
                while ((value = stream.next()) != null) {
                    try {
                        if (!value.isMapValue()) {
                            // Only map values are accepted as records; report the actual type found.
                            throw new JsonRecordValidateException(String.format("A Json record must represent map value but it's %s", value.getValueType().name()));
                        }
                        pageBuilder.setJson(column, value);
                        pageBuilder.addRecord();
                        evenOneJsonParsed = true;
                    } catch (JsonRecordValidateException e) {
                        if (stopOnInvalidRecord) {
                            throw new DataException(String.format("Invalid record: %s", value.toJson()), e);
                        }
                        log.warn(String.format("Skipped record (%s): %s", e.getMessage(), value.toJson()));
                    }
                }
            } catch (IOException | JsonParseException e) {
                if (Exec.isPreview() && evenOneJsonParsed) {
                    // ignore in preview if at least one JSON is already parsed.
                    // This leaves the file loop, so any remaining files are not read.
                    break;
                }
                throw new DataException(e);
            }
        }
        pageBuilder.finish();
    }
}
Also used : PageBuilder(org.embulk.spi.PageBuilder) IOException(java.io.IOException) JsonParseException(org.embulk.spi.json.JsonParseException) DataException(org.embulk.spi.DataException) FileInputInputStream(org.embulk.spi.util.FileInputInputStream) Column(org.embulk.spi.Column) Value(org.msgpack.value.Value) JsonParser(org.embulk.spi.json.JsonParser)

Example 4 with PageBuilder

Use of org.embulk.spi.PageBuilder in project embulk by embulk.

In the class RemoveColumnsFilterPlugin, method open:

/**
 * Opens the filter by wiring a reader over the input schema and a builder
 * over the output schema into a {@code PageConverter}.
 *
 * @param taskSource   source from which the {@code PluginTask} is loaded
 * @param inputSchema  schema used to read incoming pages
 * @param outputSchema schema used to build outgoing pages
 * @param output       destination handed to the {@code PageBuilder}
 * @return a {@code PageConverter} built from the reader, the builder, and the task's index mapping
 */
@Override
public PageOutput open(TaskSource taskSource, Schema inputSchema, Schema outputSchema, PageOutput output) {
    final PluginTask pluginTask = taskSource.loadTask(PluginTask.class);
    // Arguments are evaluated left to right: reader, builder, then the index mapping.
    return new PageConverter(
            new PageReader(inputSchema),
            new PageBuilder(getBufferAllocator(), outputSchema, output),
            pluginTask.getIndexMapping());
}
Also used : PageReader(org.embulk.spi.PageReader) PageBuilder(org.embulk.spi.PageBuilder)

Aggregations

PageBuilder (org.embulk.spi.PageBuilder): 4, Column (org.embulk.spi.Column): 3, DataException (org.embulk.spi.DataException): 3, JsonParseException (org.embulk.spi.json.JsonParseException): 3, JsonParser (org.embulk.spi.json.JsonParser): 3, ColumnVisitor (org.embulk.spi.ColumnVisitor): 2, TimestampParseException (org.embulk.spi.time.TimestampParseException): 2, TimestampParser (org.embulk.spi.time.TimestampParser): 2, JsonNode (com.fasterxml.jackson.databind.JsonNode): 1, IOException (java.io.IOException): 1, List (java.util.List): 1, PageReader (org.embulk.spi.PageReader): 1, FileInputInputStream (org.embulk.spi.util.FileInputInputStream): 1, LineDecoder (org.embulk.spi.util.LineDecoder): 1, Value (org.msgpack.value.Value): 1