Search in sources :

Example 1 with LineDecoder

use of org.embulk.spi.util.LineDecoder in project embulk by embulk.

the class CsvParserPlugin method run.

/**
 * Tokenizes every file provided by {@code input} as CSV and emits one record per parsed line
 * through a {@link PageBuilder} bound to {@code output}.
 *
 * <p>For each file, up to {@code task.getSkipHeaderLines()} header lines are skipped first. Each
 * remaining record is read column by column in schema order: a {@code null} column value becomes
 * a null cell, otherwise the text is converted according to the column type. A record is added to
 * the page only after the tokenizer has advanced to the next record without raising an error.
 *
 * <p>When a record fails with one of the exception types caught in the loop, the current line is
 * skipped. If {@code task.getStopOnInvalidRecord()} is true a {@link DataException} is thrown;
 * otherwise a warning is logged and parsing continues with the next record.
 *
 * @param taskSource source from which the {@code PluginTask} settings are loaded
 * @param schema columns to visit for every record
 * @param input files to tokenize
 * @param output destination handed to the {@link PageBuilder}
 * @throws DataException if a record is invalid and {@code stopOnInvalidRecord} is enabled
 */
@Override
public void run(TaskSource taskSource, final Schema schema, FileInput input, PageOutput output) {
    PluginTask task = taskSource.loadTask(PluginTask.class);
    // Indexed by column.getIndex() in timestampColumn() below.
    final TimestampParser[] timestampParsers = Timestamps.newTimestampColumnParsers(task, task.getSchemaConfig());
    final JsonParser jsonParser = new JsonParser();
    final CsvTokenizer tokenizer = new CsvTokenizer(new LineDecoder(input, task), task);
    // Task options are read once up front so the anonymous visitor can capture them as finals.
    final boolean allowOptionalColumns = task.getAllowOptionalColumns();
    final boolean allowExtraColumns = task.getAllowExtraColumns();
    final boolean stopOnInvalidRecord = task.getStopOnInvalidRecord();
    final int skipHeaderLines = task.getSkipHeaderLines();
    // try-with-resources closes the page builder even when a record aborts the run.
    try (final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
        while (tokenizer.nextFile()) {
            // skip the header lines for each file
            for (int skipHeaderLineNumber = skipHeaderLines; skipHeaderLineNumber > 0; skipHeaderLineNumber--) {
                // Stop early when skipHeaderLine() reports false
                // (presumably the file has fewer lines than requested; confirm in CsvTokenizer).
                if (!tokenizer.skipHeaderLine()) {
                    break;
                }
            }
            if (!tokenizer.nextRecord()) {
                // empty file
                continue;
            }
            // One iteration per record; the tokenizer is already positioned on a record here.
            while (true) {
                boolean hasNextRecord;
                try {
                    // Read one value per schema column and store it in the current page record.
                    schema.visitColumns(new ColumnVisitor() {

                        // Any value not contained in TRUE_STRINGS is stored as false.
                        public void booleanColumn(Column column) {
                            String v = nextColumn();
                            if (v == null) {
                                pageBuilder.setNull(column);
                            } else {
                                pageBuilder.setBoolean(column, TRUE_STRINGS.contains(v));
                            }
                        }

                        // An unparsable long invalidates the whole record.
                        public void longColumn(Column column) {
                            String v = nextColumn();
                            if (v == null) {
                                pageBuilder.setNull(column);
                            } else {
                                try {
                                    pageBuilder.setLong(column, Long.parseLong(v));
                                } catch (NumberFormatException e) {
                                    // TODO support default value
                                    throw new CsvRecordValidateException(e);
                                }
                            }
                        }

                        // An unparsable double invalidates the whole record.
                        public void doubleColumn(Column column) {
                            String v = nextColumn();
                            if (v == null) {
                                pageBuilder.setNull(column);
                            } else {
                                try {
                                    pageBuilder.setDouble(column, Double.parseDouble(v));
                                } catch (NumberFormatException e) {
                                    // TODO support default value
                                    throw new CsvRecordValidateException(e);
                                }
                            }
                        }

                        // Strings are stored as returned by the tokenizer, without conversion.
                        public void stringColumn(Column column) {
                            String v = nextColumn();
                            if (v == null) {
                                pageBuilder.setNull(column);
                            } else {
                                pageBuilder.setString(column, v);
                            }
                        }

                        // Uses the parser prepared for this column's index; a parse failure
                        // invalidates the whole record.
                        public void timestampColumn(Column column) {
                            String v = nextColumn();
                            if (v == null) {
                                pageBuilder.setNull(column);
                            } else {
                                try {
                                    pageBuilder.setTimestamp(column, timestampParsers[column.getIndex()].parse(v));
                                } catch (TimestampParseException e) {
                                    // TODO support default value
                                    throw new CsvRecordValidateException(e);
                                }
                            }
                        }

                        // Malformed JSON text invalidates the whole record.
                        public void jsonColumn(Column column) {
                            String v = nextColumn();
                            if (v == null) {
                                pageBuilder.setNull(column);
                            } else {
                                try {
                                    pageBuilder.setJson(column, jsonParser.parse(v));
                                } catch (JsonParseException e) {
                                    // TODO support default value
                                    throw new CsvRecordValidateException(e);
                                }
                            }
                        }

                        // Returns the next column value. With allowOptionalColumns enabled, a
                        // missing trailing column yields null instead of asking the tokenizer.
                        private String nextColumn() {
                            if (allowOptionalColumns && !tokenizer.hasNextColumn()) {
                                // TODO warning
                                return null;
                            }
                            return tokenizer.nextColumnOrNull();
                        }
                    });
                    // Advance before addRecord(): if advancing fails, the record that was just
                    // filled in is not added to the page.
                    try {
                        hasNextRecord = tokenizer.nextRecord();
                    } catch (CsvTokenizer.TooManyColumnsException ex) {
                        if (allowExtraColumns) {
                            // Drop the surplus columns of this line and keep the record.
                            // NOTE(review): tooManyColumnsLine is not used yet; it appears to be
                            // kept for the pending warning below.
                            String tooManyColumnsLine = tokenizer.skipCurrentLine();
                            // TODO warning
                            hasNextRecord = tokenizer.nextRecord();
                        } else {
                            // this line will be skipped at the following catch section
                            // NOTE(review): that only holds if TooManyColumnsException is a subtype
                            // of one of the exception types caught below; confirm in CsvTokenizer.
                            throw ex;
                        }
                    }
                    pageBuilder.addRecord();
                } catch (CsvTokenizer.InvalidFormatException | CsvTokenizer.InvalidValueException | CsvRecordValidateException e) {
                    // NOTE(review): the line number is read after skipCurrentLine(); whether it
                    // still identifies the skipped line depends on CsvTokenizer; confirm.
                    String skippedLine = tokenizer.skipCurrentLine();
                    long lineNumber = tokenizer.getCurrentLineNumber();
                    if (stopOnInvalidRecord) {
                        throw new DataException(String.format("Invalid record at line %d: %s", lineNumber, skippedLine), e);
                    }
                    log.warn(String.format("Skipped line %d (%s): %s", lineNumber, e.getMessage(), skippedLine));
                    // exec.notice().skippedLine(skippedLine);
                    hasNextRecord = tokenizer.nextRecord();
                }
                if (!hasNextRecord) {
                    break;
                }
            }
        }
        // Called once after all files have been processed.
        pageBuilder.finish();
    }
}
Also used : TimestampParser(org.embulk.spi.time.TimestampParser) PageBuilder(org.embulk.spi.PageBuilder) JsonParseException(org.embulk.spi.json.JsonParseException) TimestampParseException(org.embulk.spi.time.TimestampParseException) DataException(org.embulk.spi.DataException) ColumnVisitor(org.embulk.spi.ColumnVisitor) Column(org.embulk.spi.Column) LineDecoder(org.embulk.spi.util.LineDecoder) JsonParser(org.embulk.spi.json.JsonParser)

Example 2 with LineDecoder

use of org.embulk.spi.util.LineDecoder in project embulk by embulk.

the class TestCsvTokenizer method recoverFromQuotedSizeLimitExceededException.

/**
 * Verifies that the tokenizer keeps working after a quoted value exceeds
 * {@code max_quoted_size_limit}: the offending line can be skipped with
 * {@code skipCurrentLine()} and the records that follow it are still tokenized.
 *
 * @throws Exception if the test fixture fails unexpectedly
 */
@Test
public void recoverFromQuotedSizeLimitExceededException() throws Exception {
    config.set("max_quoted_size_limit", 12);
    reloadPluginTask();
    String[] lines = new String[] {
        "v1,v2",
        // this is a broken line (the quote is never closed) and should be skipped
        "v3,\"0123",
        // this line should not be skipped
        "v4,v5",
        // this line should not be skipped
        "v6,v7"
    };
    FileInput input = newFileInputFromLines(task, lines);
    LineDecoder decoder = new LineDecoder(input, task);
    CsvTokenizer tokenizer = new CsvTokenizer(decoder, task);
    tokenizer.nextFile();
    assertTrue(tokenizer.nextRecord());
    assertEquals("v1", tokenizer.nextColumn());
    assertEquals("v2", tokenizer.nextColumn());
    assertTrue(tokenizer.nextRecord());
    assertEquals("v3", tokenizer.nextColumn());
    try {
        // Reading the unterminated quoted value is expected to fail.
        tokenizer.nextColumn();
        fail();
    } catch (Exception e) {
        assertTrue(e instanceof CsvTokenizer.QuotedSizeLimitExceededException);
    }
    // Only the broken line itself is reported as skipped; the later lines stay available.
    assertEquals("v3,\"0123", tokenizer.skipCurrentLine());
    assertTrue(tokenizer.nextRecord());
    assertEquals("v4", tokenizer.nextColumn());
    assertEquals("v5", tokenizer.nextColumn());
    assertTrue(tokenizer.nextRecord());
    assertEquals("v6", tokenizer.nextColumn());
    assertEquals("v7", tokenizer.nextColumn());
}
Also used : Schema(org.embulk.spi.Schema) ListFileInput(org.embulk.spi.util.ListFileInput) FileInput(org.embulk.spi.FileInput) LineDecoder(org.embulk.spi.util.LineDecoder) Test(org.junit.Test)

Example 3 with LineDecoder

use of org.embulk.spi.util.LineDecoder in project embulk by embulk.

the class TestCsvTokenizer method parse.

/**
 * Tokenizes the first file of {@code input} and returns all of its records.
 *
 * <p>Each record is a list holding one entry per column of the task's schema, in schema order;
 * an entry is whatever {@code nextColumnOrNull()} returned for that position.
 *
 * @param task parser settings, also used to derive the schema
 * @param input file input to read lines from
 * @return the tokenized records, in the order they were read
 */
private static List<List<String>> parse(CsvParserPlugin.PluginTask task, FileInput input) {
    final CsvTokenizer csv = new CsvTokenizer(new LineDecoder(input, task), task);
    final Schema layout = task.getSchemaConfig().toSchema();
    csv.nextFile();
    final List<List<String>> rows = new ArrayList<>();
    for (boolean more = csv.nextRecord(); more; more = csv.nextRecord()) {
        final List<String> row = new ArrayList<>();
        // The schema only determines how many values to read; the column itself is not inspected.
        for (Column ignored : layout.getColumns()) {
            row.add(csv.nextColumnOrNull());
        }
        rows.add(row);
    }
    return rows;
}
Also used : Column(org.embulk.spi.Column) Schema(org.embulk.spi.Schema) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) LineDecoder(org.embulk.spi.util.LineDecoder)

Aggregations

LineDecoder (org.embulk.spi.util.LineDecoder) 3, Column (org.embulk.spi.Column) 2, Schema (org.embulk.spi.Schema) 2, ImmutableList (com.google.common.collect.ImmutableList) 1, ArrayList (java.util.ArrayList) 1, List (java.util.List) 1, ColumnVisitor (org.embulk.spi.ColumnVisitor) 1, DataException (org.embulk.spi.DataException) 1, FileInput (org.embulk.spi.FileInput) 1, PageBuilder (org.embulk.spi.PageBuilder) 1, JsonParseException (org.embulk.spi.json.JsonParseException) 1, JsonParser (org.embulk.spi.json.JsonParser) 1, TimestampParseException (org.embulk.spi.time.TimestampParseException) 1, TimestampParser (org.embulk.spi.time.TimestampParser) 1, ListFileInput (org.embulk.spi.util.ListFileInput) 1, Test (org.junit.Test) 1