Search in sources :

Example 1 with CsvFileWriter

use of org.hillview.storage.CsvFileWriter in project hillview by vmware.

The create method of the class SaveAsFileSketch:

@Override
public Empty create(@Nullable ITable data) {
    Converters.checkNull(data);
    try {
        if (this.schema != null)
            data = data.project(this.schema);
        // Force every column to be materialized; the result itself is unused.
        data.getLoadedColumns(data.getSchema().getColumnNames());
        File destination = new File(this.folder);
        @SuppressWarnings("unused") boolean ignored = destination.mkdir();
        // Several workers may race to create this folder, so a failed mkdir is
        // deliberately ignored; an unusable folder makes the write below fail.
        String sourceFile = data.getSourceFile();
        if (sourceFile == null)
            throw new RuntimeException("I don't know how to generate file names for the data");
        String baseName = Utilities.getBasename(sourceFile);
        String outputPath = Paths.get(this.folder, baseName + "." + kind).toString();
        HillviewLogger.instance.info("Writing data to files", "{0}", outputPath);
        // Select the writer implementation from the requested file kind.
        ITableWriter writer;
        if (kind.equals("orc")) {
            writer = new OrcFileWriter(outputPath);
        } else if (kind.equals("db")) {
            // "db" output is CSV without a header row.
            writer = new CsvFileWriter(outputPath).setWriteHeaderRow(false);
        } else if (kind.equals("csv")) {
            writer = new CsvFileWriter(outputPath);
        } else {
            throw new RuntimeException("Unknown file kind: " + kind);
        }
        writer.writeTable(data);
        if (this.createSchema) {
            String schemaFile = baseName + ".schema";
            Path temporarySchemaPath = Paths.get(this.folder, schemaFile);
            Schema tableSchema = data.getSchema();
            tableSchema.writeToJsonFile(temporarySchemaPath);
            Path finalSchemaPath = Paths.get(this.folder, Schema.schemaFileName);
            // The rename is racy as well: multiple workers may attempt it, but
            // since all of them should write identical schemas, whichever
            // atomic move wins leaves the correct file behind.
            Files.move(temporarySchemaPath, finalSchemaPath, StandardCopyOption.ATOMIC_MOVE);
        }
        return this.zero();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Also used : Path(java.nio.file.Path) ITableWriter(org.hillview.storage.ITableWriter) Schema(org.hillview.table.Schema) OrcFileWriter(org.hillview.storage.OrcFileWriter) IOException(java.io.IOException) File(java.io.File) CsvFileWriter(org.hillview.storage.CsvFileWriter)

Example 2 with CsvFileWriter

use of org.hillview.storage.CsvFileWriter in project hillview by vmware.

The main method of the class DemoDataCleaner:

/**
 * Normalizes the demo flight-data files: reads the raw CSV files, renames a
 * changed column, projects them to a short schema, and writes CSV/ORC (and
 * optionally Parquet) versions alongside the originals.
 *
 * @param args unused.
 * @throws IOException if listing the data folder or reading a file fails.
 */
public static void main(String[] args) throws IOException {
    boolean parquetEnabled = (System.getProperty("parquet.enabled") != null);
    HillviewLogger.initialize("data cleaner", "hillview.log");
    String prefix = "On_Time_On_Time_Performance_";
    Path folder = Paths.get(dataFolder);
    Schema schema = Schema.readFromJsonFile(Paths.get(dataFolder, "short.schema"));
    // try-with-resources: Files.walk streams hold an open directory handle and
    // must be closed even when an exception escapes the forEach body; the
    // previous explicit close() leaked the stream on any failure.
    try (Stream<Path> files = Files.walk(folder, 1)) {
        files.filter(f -> {
            String filename = f.getFileName().toString();
            return filename.contains("csv") && filename.startsWith(prefix);
        }).sorted(Comparator.comparing(Path::toString)).forEach(f -> {
            String filename = f.toString();
            CsvFileLoader.Config config = new CsvFileLoader.Config();
            config.allowFewerColumns = false;
            config.hasHeaderRow = true;
            CsvFileLoader r = new CsvFileLoader(filename, config, new LazySchema(dataFolder + "/On_Time.schema"));
            System.out.println("Reading " + f);
            ITable tbl = r.load();
            assert tbl != null;
            if (tbl.getSchema().containsColumnName("Reporting_Airline")) {
                // The schema has changed at some point
                HashMap<String, String> h = new HashMap<>();
                h.put("Reporting_Airline", "UniqueCarrier");
                tbl = tbl.renameColumns(h);
            }
            ITable p = tbl.project(schema);
            String end = filename.replace(prefix, "");
            if (end.endsWith(".gz"))
                // the output is uncompressed
                end = end.replace(".gz", "");
            if (!Files.exists(Paths.get(end))) {
                CsvFileWriter writer = new CsvFileWriter(end);
                System.out.println("Writing " + end);
                writer.writeTable(p);
            }
            end = end.replace(".csv", ".orc");
            File fend = new File(end);
            if (!fend.exists()) {
                OrcFileWriter owriter = new OrcFileWriter(end);
                System.out.println("Writing " + end);
                owriter.writeTable(p);
            }
            if (parquetEnabled) {
                final String parquetFileName = end.replace(".orc", ".parquet");
                File parquetFile = new File(parquetFileName);
                if (!parquetFile.exists()) {
                    ParquetFileWriter writer = new ParquetFileWriter(parquetFileName);
                    System.out.println("Writing " + parquetFileName);
                    try {
                        writer.writeTable(p);
                    } catch (RuntimeException runtimeException) {
                        System.err.println("Error when writing to parquet file: " + runtimeException.getMessage());
                        // If the exception happens during writing, an incomplete file may be left
                        try {
                            Files.deleteIfExists(parquetFile.toPath());
                        } catch (IOException ioException) {
                            System.err.println("Auto Deletion failed: " + ioException.getMessage());
                            System.err.println("Please manually delete " + parquetFile.getPath());
                            System.exit(-1);
                        }
                    }
                }
            }
            // Also keep a full-schema ORC copy of the original file.
            String big = filename.replace(".csv.gz", ".orc");
            File fbig = new File(big);
            if (!fbig.exists()) {
                OrcFileWriter owriter = new OrcFileWriter(big);
                System.out.println("Writing " + big);
                owriter.writeTable(tbl);
            }
        });
    }
}
Also used : Path(java.nio.file.Path) CsvFileLoader(org.hillview.storage.CsvFileLoader) HashMap(java.util.HashMap) LazySchema(org.hillview.table.LazySchema) Schema(org.hillview.table.Schema) OrcFileWriter(org.hillview.storage.OrcFileWriter) IOException(java.io.IOException) ParquetFileWriter(org.hillview.storage.ParquetFileWriter) LazySchema(org.hillview.table.LazySchema) ITable(org.hillview.table.api.ITable) File(java.io.File) CsvFileWriter(org.hillview.storage.CsvFileWriter)

Example 3 with CsvFileWriter

use of org.hillview.storage.CsvFileWriter in project hillview by vmware.

The writeReadTable method of the class CsvFileTest:

/**
 * Round-trip test helper: writes the table and its schema to uniquely-named
 * temporary files in the current directory, loads the table back through
 * CsvFileLoader, and asserts the reloaded table prints identically.
 *
 * @param table the table to write and re-read.
 * @throws IOException if writing the schema or deleting temp files fails.
 */
private void writeReadTable(ITable table) throws IOException {
    // Random names avoid collisions between concurrently-running tests.
    String path = "./" + UUID.randomUUID().toString();
    Path schemaPath = Paths.get(".", UUID.randomUUID().toString());
    try {
        CsvFileWriter writer = new CsvFileWriter(path);
        writer.setWriteHeaderRow(true);
        writer.writeTable(table);
        table.getSchema().writeToJsonFile(schemaPath);
        CsvFileLoader.Config config = new CsvFileLoader.Config();
        config.allowFewerColumns = false;
        config.hasHeaderRow = true;
        CsvFileLoader loader = new CsvFileLoader(path, config, new LazySchema(schemaPath.toString()));
        ITable reloaded = loader.load();
        Assert.assertNotNull(reloaded);
        // Compare full textual dumps; this checks both schema and data.
        String expected = table.toLongString(table.getNumOfRows());
        String actual = reloaded.toLongString(reloaded.getNumOfRows());
        Assert.assertEquals(expected, actual);
    } finally {
        // deleteIfExists replaces the racy exists-then-delete pattern.
        Files.deleteIfExists(Paths.get(path));
        Files.deleteIfExists(schemaPath);
    }
}
Also used : Path(java.nio.file.Path) CsvFileLoader(org.hillview.storage.CsvFileLoader) LazySchema(org.hillview.table.LazySchema) ITable(org.hillview.table.api.ITable) UUID(java.util.UUID) CsvFileWriter(org.hillview.storage.CsvFileWriter)

Aggregations

Path (java.nio.file.Path)3 CsvFileWriter (org.hillview.storage.CsvFileWriter)3 File (java.io.File)2 IOException (java.io.IOException)2 CsvFileLoader (org.hillview.storage.CsvFileLoader)2 OrcFileWriter (org.hillview.storage.OrcFileWriter)2 LazySchema (org.hillview.table.LazySchema)2 Schema (org.hillview.table.Schema)2 ITable (org.hillview.table.api.ITable)2 HashMap (java.util.HashMap)1 UUID (java.util.UUID)1 ITableWriter (org.hillview.storage.ITableWriter)1 ParquetFileWriter (org.hillview.storage.ParquetFileWriter)1