Use of org.hillview.storage.CsvFileWriter in project hillview by VMware.
The class SaveAsFileSketch, method create:
@Override
public Empty create(@Nullable ITable data) {
    Converters.checkNull(data);
    try {
        if (this.schema != null)
            data = data.project(this.schema);
        // Executed for side-effect.
        data.getLoadedColumns(data.getSchema().getColumnNames());
        File file = new File(this.folder);
        @SuppressWarnings("unused") boolean ignored = file.mkdir();
        // There is a race here: multiple workers may try to create the
        // folder at the same time, so we don't bother if the creation fails.
        // If the folder can't be created the writing below will fail.
        String tableFile = data.getSourceFile();
        if (tableFile == null)
            throw new RuntimeException("I don't know how to generate file names for the data");
        String baseName = Utilities.getBasename(tableFile);
        String path = Paths.get(this.folder, baseName + "." + kind).toString();
        HillviewLogger.instance.info("Writing data to files", "{0}", path);
        ITableWriter writer;
        switch (kind) {
            case "orc":
                writer = new OrcFileWriter(path);
                break;
            case "db":
                writer = new CsvFileWriter(path).setWriteHeaderRow(false);
                break;
            case "csv":
                writer = new CsvFileWriter(path);
                break;
            default:
                throw new RuntimeException("Unknown file kind: " + kind);
        }
        writer.writeTable(data);
        if (this.createSchema) {
            String schemaFile = baseName + ".schema";
            Path schemaPath = Paths.get(this.folder, schemaFile);
            Schema toWrite = data.getSchema();
            toWrite.writeToJsonFile(schemaPath);
            Path finalSchemaPath = Paths.get(this.folder, Schema.schemaFileName);
            // Attempt to atomically rename the schema; this is also a race which
            // may be won by multiple participants. Hopefully all the schemas
            // written should be identical, so it does not matter if this happens
            // many times.
            Files.move(schemaPath, finalSchemaPath, StandardCopyOption.ATOMIC_MOVE);
        }
        return this.zero();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
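The schema-publication step above leans on Files.move with StandardCopyOption.ATOMIC_MOVE: each worker writes its own copy of the schema under a private name and then atomically renames it to the shared name, so the race between workers is harmless as long as every copy is identical. Below is a minimal standalone sketch of that pattern using only java.nio.file; the folder, file names, and JSON payload are illustrative, not taken from hillview.

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;

public class AtomicSchemaPublish {
    public static void main(String[] args) throws IOException {
        Path folder = Paths.get("demo-output");          // illustrative folder name
        Files.createDirectories(folder);
        // Each worker first writes its own uniquely named copy of the schema.
        Path temp = Files.createTempFile(folder, "schema", ".tmp");
        Files.write(temp, "{\"columns\":[]}".getBytes(StandardCharsets.UTF_8));
        // Then it renames its copy to the shared name in a single atomic step.
        // If several workers race, each rename is still atomic and the last one
        // wins, which is harmless when every copy has identical content.
        Path shared = folder.resolve("data.schema");
        Files.move(temp, shared, StandardCopyOption.ATOMIC_MOVE);
    }
}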
Use of org.hillview.storage.CsvFileWriter in project hillview by VMware.
The class DemoDataCleaner, method main:
public static void main(String[] args) throws IOException {
    boolean parquetEnabled = (System.getProperty("parquet.enabled") != null);
    HillviewLogger.initialize("data cleaner", "hillview.log");
    String prefix = "On_Time_On_Time_Performance_";
    Path folder = Paths.get(dataFolder);
    Stream<Path> files = Files.walk(folder, 1);
    Schema schema = Schema.readFromJsonFile(Paths.get(dataFolder, "short.schema"));
    files.filter(f -> {
        String filename = f.getFileName().toString();
        if (!filename.contains("csv"))
            return false;
        // noinspection RedundantIfStatement
        if (!filename.startsWith(prefix))
            return false;
        return true;
    }).sorted(Comparator.comparing(Path::toString)).forEach(f -> {
        String filename = f.toString();
        CsvFileLoader.Config config = new CsvFileLoader.Config();
        config.allowFewerColumns = false;
        config.hasHeaderRow = true;
        CsvFileLoader r = new CsvFileLoader(filename, config, new LazySchema(dataFolder + "/On_Time.schema"));
        System.out.println("Reading " + f);
        ITable tbl = r.load();
        assert tbl != null;
        if (tbl.getSchema().containsColumnName("Reporting_Airline")) {
            // The schema has changed at some point
            HashMap<String, String> h = new HashMap<>();
            h.put("Reporting_Airline", "UniqueCarrier");
            tbl = tbl.renameColumns(h);
        }
        ITable p = tbl.project(schema);
        String end = filename.replace(prefix, "");
        if (end.endsWith(".gz"))
            // the output is uncompressed
            end = end.replace(".gz", "");
        if (!Files.exists(Paths.get(end))) {
            CsvFileWriter writer = new CsvFileWriter(end);
            System.out.println("Writing " + end);
            writer.writeTable(p);
        }
        end = end.replace(".csv", ".orc");
        File fend = new File(end);
        if (!fend.exists()) {
            OrcFileWriter owriter = new OrcFileWriter(end);
            System.out.println("Writing " + end);
            owriter.writeTable(p);
        }
        if (parquetEnabled) {
            final String parquetFileName = end.replace(".orc", ".parquet");
            File parquetFile = new File(parquetFileName);
            if (!parquetFile.exists()) {
                ParquetFileWriter writer = new ParquetFileWriter(parquetFileName);
                System.out.println("Writing " + parquetFileName);
                try {
                    writer.writeTable(p);
                } catch (RuntimeException runtimeException) {
                    System.err.println("Error when writing to parquet file: " + runtimeException.getMessage());
                    // If the exception happens during writing, an incomplete file may be left
                    try {
                        Files.deleteIfExists(parquetFile.toPath());
                    } catch (IOException ioException) {
                        System.err.println("Auto Deletion failed: " + ioException.getMessage());
                        System.err.println("Please manually delete " + parquetFile.getPath());
                        System.exit(-1);
                    }
                }
            }
        }
        String big = filename.replace(".csv.gz", ".orc");
        File fbig = new File(big);
        if (!fbig.exists()) {
            OrcFileWriter owriter = new OrcFileWriter(big);
            System.out.println("Writing " + big);
            owriter.writeTable(tbl);
        }
    });
    files.close();
}
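The parquet branch above takes care not to leave a truncated file behind: if writeTable throws, the partially written output is deleted before the error is surfaced. The following is a generic sketch of that cleanup pattern; writeOrDelete and its Runnable argument are placeholders chosen for illustration, not hillview APIs.

import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

public class WriteOrCleanUp {
    // Runs the supplied write action; if it fails, removes the partial output
    // file before propagating the original exception.
    static void writeOrDelete(Path output, Runnable writeAction) {
        try {
            writeAction.run();
        } catch (RuntimeException writeError) {
            try {
                Files.deleteIfExists(output);        // drop the incomplete file
            } catch (IOException cleanupError) {
                throw new UncheckedIOException(
                        "Could not delete partial output " + output, cleanupError);
            }
            throw writeError;                        // re-throw the original failure
        }
    }

    public static void main(String[] args) {
        Path out = Paths.get("example.parquet");     // illustrative file name
        writeOrDelete(out, () -> { /* writer.writeTable(table) would go here */ });
    }
}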
Use of org.hillview.storage.CsvFileWriter in project hillview by VMware.
The class CsvFileTest, method writeReadTable:
private void writeReadTable(ITable table) throws IOException {
    UUID uid = UUID.randomUUID();
    String tmpFileName = uid.toString();
    String path = "./" + tmpFileName;
    UUID uid1 = UUID.randomUUID();
    String tmpFileName1 = uid1.toString();
    Path schemaPath = Paths.get(".", tmpFileName1);
    try {
        CsvFileWriter writer = new CsvFileWriter(path);
        writer.setWriteHeaderRow(true);
        writer.writeTable(table);
        table.getSchema().writeToJsonFile(schemaPath);
        CsvFileLoader.Config config = new CsvFileLoader.Config();
        config.allowFewerColumns = false;
        config.hasHeaderRow = true;
        CsvFileLoader r = new CsvFileLoader(path, config, new LazySchema(schemaPath.toString()));
        ITable t = r.load();
        Assert.assertNotNull(t);
        String ft = table.toLongString(table.getNumOfRows());
        String st = t.toLongString(t.getNumOfRows());
        Assert.assertEquals(ft, st);
    } finally {
        if (Files.exists(Paths.get(path)))
            Files.delete(Paths.get(path));
        if (Files.exists(schemaPath))
            Files.delete(schemaPath);
    }
}
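The test above builds its temporary file names from random UUIDs in the working directory. An equivalent arrangement is to let the JDK pick and create the temporary files; the sketch below shows only that scaffolding, with the actual write, reload, and compare steps elided.

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

public class TempFileScaffold {
    public static void main(String[] args) throws IOException {
        // createTempFile both chooses a unique name and creates the file up
        // front, so no other process can grab the same name in the meantime.
        Path csvPath = Files.createTempFile("table", ".csv");
        Path schemaPath = Files.createTempFile("table", ".schema");
        try {
            // Write the table to csvPath and its schema to schemaPath, reload
            // with CsvFileLoader, and compare, as writeReadTable does above.
        } finally {
            Files.deleteIfExists(csvPath);
            Files.deleteIfExists(schemaPath);
        }
    }
}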