Use of cascading.tap.hadoop.Hfs in project parquet-mr by apache.
The class ParquetScroogeSchemeTest, method doRead.
private void doRead() throws Exception {
  Path path = new Path(txtOutputPath);
  final FileSystem fs = path.getFileSystem(new Configuration());
  if (fs.exists(path))
    fs.delete(path, true);

  Scheme sourceScheme = new ParquetScroogeScheme<Name>(Name.class);
  Tap source = new Hfs(sourceScheme, parquetOutputPath);

  Scheme sinkScheme = new TextLine(new Fields("first", "last"));
  Tap sink = new Hfs(sinkScheme, txtOutputPath);

  Pipe assembly = new Pipe("namecp");
  assembly = new Each(assembly, new UnpackThriftFunction());
  Flow flow = new HadoopFlowConnector().connect("namecp", source, sink, assembly);
  flow.complete();

  String result = FileUtils.readFileToString(new File(txtOutputPath + "/part-00000"));
  assertEquals("0\tAlice\tPractice\n15\tBob\tHope\n24\tCharlie\tHorse\n", result);
}
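The UnpackThriftFunction applied in the Each pipe above is not shown on this page. A minimal sketch of what such a Cascading Function could look like follows; it assumes the Scrooge-generated Name record exposes firstName() and lastName() accessors, and both the accessor names and the emitted field layout are assumptions rather than details taken from the snippet above.

import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;

// Hypothetical sketch of UnpackThriftFunction: flattens a Scrooge Name record
// into a ("first", "last") tuple. The Name import and accessors are assumptions.
public class UnpackThriftFunction extends BaseOperation implements Function {

  public UnpackThriftFunction() {
    // Declare the fields this function emits.
    super(new Fields("first", "last"));
  }

  @Override
  public void operate(FlowProcess flowProcess, FunctionCall functionCall) {
    Name name = (Name) functionCall.getArguments().getObject(0);
    functionCall.getOutputCollector().add(new Tuple(name.firstName(), name.lastName()));
  }
}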
Use of cascading.tap.hadoop.Hfs in project parquet-mr by apache.
The class TestParquetTupleScheme, method testFieldProjection.
@Test
public void testFieldProjection() throws Exception {
  createFileForRead();

  Path path = new Path(txtOutputPath);
  final FileSystem fs = path.getFileSystem(new Configuration());
  if (fs.exists(path))
    fs.delete(path, true);

  Scheme sourceScheme = new ParquetTupleScheme(new Fields("last_name"));
  Tap source = new Hfs(sourceScheme, parquetInputPath);

  Scheme sinkScheme = new TextLine(new Fields("last_name"));
  Tap sink = new Hfs(sinkScheme, txtOutputPath);

  Pipe assembly = new Pipe("namecp");
  assembly = new Each(assembly, new ProjectedTupleFunction());
  Flow flow = new HadoopFlowConnector().connect("namecp", source, sink, assembly);
  flow.complete();

  String result = FileUtils.readFileToString(new File(txtOutputPath + "/part-00000"));
  assertEquals("Practice\nHope\nHorse\n", result);
}
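Passing new Fields("last_name") to ParquetTupleScheme is what requests the column projection: only that column is asked of the Parquet reader, which is what the assertion checks. The ProjectedTupleFunction is likewise not shown here; under the assumption that it simply forwards the single projected field, a sketch (same imports as the sketch above) could look like this.

// Hypothetical sketch of ProjectedTupleFunction: passes the projected
// "last_name" field through unchanged. The body is an assumption.
public class ProjectedTupleFunction extends BaseOperation implements Function {

  public ProjectedTupleFunction() {
    super(new Fields("last_name"));
  }

  @Override
  public void operate(FlowProcess flowProcess, FunctionCall functionCall) {
    // The incoming tuple contains only the column requested by the scheme.
    functionCall.getOutputCollector().add(new Tuple(functionCall.getArguments().getObject(0)));
  }
}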
Use of cascading.tap.hadoop.Hfs in project parquet-mr by apache.
The class TestParquetTupleScheme, method testReadWrite.
public void testReadWrite(String inputPath) throws Exception {
  createFileForRead();

  Path path = new Path(txtOutputPath);
  final FileSystem fs = path.getFileSystem(new Configuration());
  if (fs.exists(path))
    fs.delete(path, true);

  Scheme sourceScheme = new ParquetTupleScheme(new Fields("first_name", "last_name"));
  Tap source = new Hfs(sourceScheme, inputPath);

  Scheme sinkScheme = new TextLine(new Fields("first", "last"));
  Tap sink = new Hfs(sinkScheme, txtOutputPath);

  Pipe assembly = new Pipe("namecp");
  assembly = new Each(assembly, new UnpackTupleFunction());
  Flow flow = new HadoopFlowConnector().connect("namecp", source, sink, assembly);
  flow.complete();

  String result = FileUtils.readFileToString(new File(txtOutputPath + "/part-00000"));
  assertEquals("Alice\tPractice\nBob\tHope\nCharlie\tHorse\n", result);
}
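testReadWrite above only exercises the read path, with TextLine as the sink. Going the other way, ParquetTupleScheme is used on the sink side with an explicit Parquet message type. The following is a hedged sketch of that direction; it assumes the ParquetTupleScheme(sourceFields, sinkFields, schema) constructor, and the paths, the TextDelimited source, and the schema string are illustrative assumptions.

// Hedged write-side sketch: copy tab-delimited names into a Parquet file.
// The input/output paths and the message type string are assumptions.
Scheme sourceScheme = new TextDelimited(new Fields("first_name", "last_name"), "\t");
Tap source = new Hfs(sourceScheme, "/tmp/names.tsv");

String parquetSchema =
    "message Name { required binary first_name (UTF8); required binary last_name (UTF8); }";
Scheme sinkScheme = new ParquetTupleScheme(
    new Fields("first_name", "last_name"), new Fields("first_name", "last_name"), parquetSchema);
Tap sink = new Hfs(sinkScheme, "/tmp/names-parquet");

Pipe assembly = new Pipe("namewrite");
Flow flow = new HadoopFlowConnector().connect("namewrite", source, sink, assembly);
flow.complete();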
Use of cascading.tap.hadoop.Hfs in project parquet-mr by apache.
The class ParquetTupleScheme, method readSchema.
private MessageType readSchema(FlowProcess<? extends JobConf> flowProcess, Tap tap) {
  try {
    Hfs hfs;

    if (tap instanceof CompositeTap)
      hfs = (Hfs) ((CompositeTap) tap).getChildTaps().next();
    else
      hfs = (Hfs) tap;

    List<Footer> footers = getFooters(flowProcess, hfs);
    if (footers.isEmpty()) {
      throw new TapException("Could not read Parquet metadata at " + hfs.getPath());
    } else {
      return footers.get(0).getParquetMetadata().getFileMetaData().getSchema();
    }
  } catch (IOException e) {
    throw new TapException(e);
  }
}
Use of cascading.tap.hadoop.Hfs in project parquet-mr by apache.
The class ParquetTupleScheme, method readSchema (variant whose signature takes FlowProcess<JobConf> rather than FlowProcess<? extends JobConf>).
private MessageType readSchema(FlowProcess<JobConf> flowProcess, Tap tap) {
  try {
    Hfs hfs;

    if (tap instanceof CompositeTap)
      hfs = (Hfs) ((CompositeTap) tap).getChildTaps().next();
    else
      hfs = (Hfs) tap;

    List<Footer> footers = getFooters(flowProcess, hfs);
    if (footers.isEmpty()) {
      throw new TapException("Could not read Parquet metadata at " + hfs.getPath());
    } else {
      return footers.get(0).getParquetMetadata().getFileMetaData().getSchema();
    }
  } catch (IOException e) {
    throw new TapException(e);
  }
}
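Both readSchema variants delegate to a getFooters helper that is not shown on this page. A minimal sketch of such a helper follows, under the assumption that ParquetFileReader.readFooters(Configuration, Path) is used to list the footers for the tap's path.

// Hedged sketch of the getFooters helper referenced above; delegating to
// ParquetFileReader.readFooters is an assumption, not the shown implementation.
private List<Footer> getFooters(FlowProcess<? extends JobConf> flowProcess, Hfs hfs) throws IOException {
  JobConf jobConf = flowProcess.getConfigCopy();
  return ParquetFileReader.readFooters(jobConf, hfs.getPath());
}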