use of org.apache.hadoop.mapred.FileSplit in project voldemort by voldemort.
the class JsonSequenceFileInputFormat method getRecordReader.
@Override
public RecordReader<BytesWritable, BytesWritable> getRecordReader(InputSplit split, JobConf conf, Reporter reporter) throws IOException {
    String inputPathString = ((FileSplit) split).getPath().toUri().getPath();
    log.info("Input file path: " + inputPathString);
    Path inputPath = new Path(inputPathString);
    SequenceFile.Reader reader = new SequenceFile.Reader(inputPath.getFileSystem(conf), inputPath, conf);
    SequenceFile.Metadata meta = reader.getMetadata();
    try {
        Text keySchema = meta.get(new Text("key.schema"));
        Text valueSchema = meta.get(new Text("value.schema"));
        if (0 == keySchema.getLength() || 0 == valueSchema.getLength()) {
            throw new Exception();
        }
        // update the JobConf with the schemas read from the file metadata
        conf.set("mapper.input.key.schema", keySchema.toString());
        conf.set("mapper.input.value.schema", valueSchema.toString());
    } catch (Exception e) {
        throw new IOException("Failed to load schema from file: " + inputPathString + "\n");
    }
    return super.getRecordReader(split, conf, reporter);
}
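For context, here is a minimal sketch (not taken from Voldemort) of writing a SequenceFile whose metadata carries the key.schema and value.schema entries this input format looks up. The output path and the JSON schema strings are hypothetical, and the Writer.Option-based createWriter API is assumed to be available in the Hadoop version in use:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SchemaMetadataWriterSketch {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical schemas; a real job would use the store's actual JSON schemas.
        SequenceFile.Metadata meta = new SequenceFile.Metadata();
        meta.set(new Text("key.schema"), new Text("'string'"));
        meta.set(new Text("value.schema"), new Text("{'views':'int32'}"));
        SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(new Path("/tmp/example.seq")),
                SequenceFile.Writer.keyClass(BytesWritable.class),
                SequenceFile.Writer.valueClass(BytesWritable.class),
                SequenceFile.Writer.metadata(meta));
        try {
            // One dummy record so the file is non-empty.
            writer.append(new BytesWritable(new byte[] { 1 }), new BytesWritable(new byte[] { 2 }));
        } finally {
            writer.close();
        }
    }
}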
use of org.apache.hadoop.mapred.FileSplit in project voldemort by voldemort.
the class MysqlBuildPerformanceTest method main.
public static void main(String[] args) throws FileNotFoundException, IOException {
    if (args.length != 3)
        Utils.croak("USAGE: java " + MysqlBuildPerformanceTest.class.getName() + " serverPropsFile storeName jsonSequenceDataFile");
    String serverPropsFile = args[0];
    String storeName = args[1];
    String jsonDataFile = args[2];
    final Store<ByteArray, byte[], byte[]> store = new MysqlStorageConfiguration(new VoldemortConfig(new Props(new File(serverPropsFile)))).getStore(TestUtils.makeStoreDefinition(storeName), TestUtils.makeSingleNodeRoutingStrategy());
    final AtomicInteger obsoletes = new AtomicInteger(0);
    Path jsonFilePath = new Path(jsonDataFile);
    FileStatus jsonFileStatus = jsonFilePath.getFileSystem(new Configuration()).listStatus(jsonFilePath)[0];
    final SequenceFileRecordReader<BytesWritable, BytesWritable> reader = new SequenceFileRecordReader<BytesWritable, BytesWritable>(new Configuration(), new FileSplit(jsonFilePath, 0, jsonFileStatus.getLen(), (String[]) null));
    PerformanceTest readWriteTest = new PerformanceTest() {

        @Override
        public void doOperation(int index) throws Exception {
            try {
                BytesWritable key = new BytesWritable();
                BytesWritable value = new BytesWritable();
                reader.next(key, value);
                store.put(new ByteArray(ByteUtils.copy(key.get(), 0, key.getSize())), Versioned.value(ByteUtils.copy(value.get(), 0, value.getSize())), null);
            } catch (ObsoleteVersionException e) {
                obsoletes.incrementAndGet();
            }
        }
    };
    readWriteTest.run(1000, 1);
    System.out.println("MySQL write throughput with one thread:");
    readWriteTest.printStats();
}
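The benchmark above calls reader.next() a fixed number of times. For comparison, here is a self-contained sketch (not part of the Voldemort test, path taken from the command line) that reads an entire file through the same FileSplit/SequenceFileRecordReader pair until next() returns false:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.SequenceFileRecordReader;

public class SequenceFileDrainSketch {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]);
        FileStatus status = path.getFileSystem(conf).getFileStatus(path);
        // A split covering the whole file; null hosts means no locality information.
        FileSplit split = new FileSplit(path, 0, status.getLen(), (String[]) null);
        SequenceFileRecordReader<BytesWritable, BytesWritable> reader =
                new SequenceFileRecordReader<BytesWritable, BytesWritable>(conf, split);
        BytesWritable key = reader.createKey();
        BytesWritable value = reader.createValue();
        long records = 0;
        // next() returns false at the end of the split, unlike the fixed-iteration benchmark above.
        while (reader.next(key, value)) {
            records++;
        }
        reader.close();
        System.out.println("Read " + records + " records");
    }
}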
use of org.apache.hadoop.mapred.FileSplit in project voldemort by voldemort.
the class BdbBuildPerformanceTest method main.
public static void main(String[] args) throws FileNotFoundException, IOException {
    if (args.length != 3)
        Utils.croak("USAGE: java " + BdbBuildPerformanceTest.class.getName() + " serverPropsFile storeName jsonSequenceDataFile");
    String serverPropsFile = args[0];
    String storeName = args[1];
    String jsonDataFile = args[2];
    final Store<ByteArray, byte[], byte[]> store = new BdbStorageConfiguration(new VoldemortConfig(new Props(new File(serverPropsFile)))).getStore(TestUtils.makeStoreDefinition(storeName), TestUtils.makeSingleNodeRoutingStrategy());
    final AtomicInteger obsoletes = new AtomicInteger(0);
    Path jsonFilePath = new Path(jsonDataFile);
    FileStatus jsonFileStatus = jsonFilePath.getFileSystem(new Configuration()).listStatus(jsonFilePath)[0];
    final SequenceFileRecordReader<BytesWritable, BytesWritable> reader = new SequenceFileRecordReader<BytesWritable, BytesWritable>(new Configuration(), new FileSplit(jsonFilePath, 0, jsonFileStatus.getLen(), (String[]) null));
    PerformanceTest readWriteTest = new PerformanceTest() {

        @Override
        public void doOperation(int index) throws Exception {
            try {
                BytesWritable key = new BytesWritable();
                BytesWritable value = new BytesWritable();
                reader.next(key, value);
                store.put(new ByteArray(ByteUtils.copy(key.get(), 0, key.getSize())), Versioned.value(ByteUtils.copy(value.get(), 0, value.getSize())), null);
            } catch (ObsoleteVersionException e) {
                obsoletes.incrementAndGet();
            }
        }
    };
    readWriteTest.run(30 * 1000 * 1000, 1);
    System.out.println("Bdb write throughput with one thread:");
    readWriteTest.printStats();
}
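Both benchmarks copy the key and value bytes with ByteUtils.copy before handing them to the store, because BytesWritable exposes a backing buffer that can be longer than the valid data. A small illustrative sketch of the same copy using only Hadoop's own API (copyBytes() is assumed to be available in the Hadoop version in use):

import java.util.Arrays;
import org.apache.hadoop.io.BytesWritable;

public class BytesWritableCopySketch {

    public static void main(String[] args) {
        BytesWritable value = new BytesWritable(new byte[] { 1, 2, 3 });
        // getBytes() returns the backing buffer, which may be padded beyond getLength(),
        // so only the valid range should be copied out before the value is stored.
        byte[] copiedManually = Arrays.copyOfRange(value.getBytes(), 0, value.getLength());
        // copyBytes() does the same truncated copy in one call.
        byte[] copiedDirectly = value.copyBytes();
        System.out.println(copiedManually.length + " " + copiedDirectly.length);
    }
}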
use of org.apache.hadoop.mapred.FileSplit in project presto by prestodb.
the class HudiRealtimeBootstrapBaseFileSplitConverter method recreateFileSplitWithCustomInfo.
@Override
public Optional<FileSplit> recreateFileSplitWithCustomInfo(FileSplit split, Map<String, String> customSplitInfo) throws IOException {
    requireNonNull(customSplitInfo);
    String customFileSplitClass = customSplitInfo.get(CUSTOM_FILE_SPLIT_CLASS_KEY);
    if (!isNullOrEmpty(customFileSplitClass) && RealtimeBootstrapBaseFileSplit.class.getName().equals(customFileSplitClass)) {
        String deltaFilePaths = customSplitInfo.get(DELTA_FILE_PATHS_KEY);
        List<String> deltaLogPaths = isNullOrEmpty(deltaFilePaths) ? Collections.emptyList() : Arrays.asList(deltaFilePaths.split(","));
        FileSplit bootstrapFileSplit = new FileSplit(new Path(customSplitInfo.get(BOOTSTRAP_FILE_SPLIT_PATH)), parseLong(customSplitInfo.get(BOOTSTRAP_FILE_SPLIT_START)), parseLong(customSplitInfo.get(BOOTSTRAP_FILE_SPLIT_LEN)), (String[]) null);
        split = new RealtimeBootstrapBaseFileSplit(split, customSplitInfo.get(BASE_PATH_KEY), deltaLogPaths, customSplitInfo.get(MAX_COMMIT_TIME_KEY), bootstrapFileSplit);
        return Optional.of(split);
    }
    return Optional.empty();
}
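The converter rebuilds a FileSplit from the path, start, and length strings stored in customSplitInfo. A stripped-down sketch of that pattern follows; the map keys and the HDFS path below are hypothetical placeholders, not the real values of the BOOTSTRAP_FILE_SPLIT_* constants:

import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;

public class FileSplitFromMapSketch {

    public static void main(String[] args) {
        // Hypothetical keys standing in for BOOTSTRAP_FILE_SPLIT_PATH / _START / _LEN.
        Map<String, String> customSplitInfo = new HashMap<String, String>();
        customSplitInfo.put("bootstrap_split_path", "hdfs:///warehouse/tbl/part-0.parquet");
        customSplitInfo.put("bootstrap_split_start", "0");
        customSplitInfo.put("bootstrap_split_len", "134217728");
        // Same FileSplit constructor as in the converter: path, start offset, length, no host hints.
        FileSplit split = new FileSplit(
                new Path(customSplitInfo.get("bootstrap_split_path")),
                Long.parseLong(customSplitInfo.get("bootstrap_split_start")),
                Long.parseLong(customSplitInfo.get("bootstrap_split_len")),
                (String[]) null);
        System.out.println(split.getPath() + " " + split.getStart() + " " + split.getLength());
    }
}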
use of org.apache.hadoop.mapred.FileSplit in project presto by prestodb.
the class HiveUtil method createRecordReader.
public static RecordReader<?, ?> createRecordReader(Configuration configuration, Path path, long start, long length, Properties schema, List<HiveColumnHandle> columns, Map<String, String> customSplitInfo) {
    // determine which hive columns we will read
    List<HiveColumnHandle> readColumns = ImmutableList.copyOf(filter(columns, column -> column.getColumnType() == REGULAR));
    List<Integer> readHiveColumnIndexes = ImmutableList.copyOf(transform(readColumns, HiveColumnHandle::getHiveColumnIndex));
    // Tell hive the columns we would like to read, this lets hive optimize reading column oriented files
    setReadColumns(configuration, readHiveColumnIndexes);
    // Only propagate serialization schema configs by default
    Predicate<String> schemaFilter = schemaProperty -> schemaProperty.startsWith("serialization.");
    InputFormat<?, ?> inputFormat = getInputFormat(configuration, getInputFormatName(schema), true);
    JobConf jobConf = toJobConf(configuration);
    FileSplit fileSplit = new FileSplit(path, start, length, (String[]) null);
    if (!customSplitInfo.isEmpty() && isHudiRealtimeSplit(customSplitInfo)) {
        fileSplit = recreateSplitWithCustomInfo(fileSplit, customSplitInfo);
        // Add additional column information for record reader
        List<String> readHiveColumnNames = ImmutableList.copyOf(transform(readColumns, HiveColumnHandle::getName));
        jobConf.set(READ_COLUMN_NAMES_CONF_STR, Joiner.on(',').join(readHiveColumnNames));
        // Remove filter when using customSplitInfo as the record reader requires complete schema configs
        schemaFilter = schemaProperty -> true;
    }
    schema.stringPropertyNames().stream().filter(schemaFilter).forEach(name -> jobConf.set(name, schema.getProperty(name)));
    // add Airlift LZO and LZOP to head of codecs list so as to not override existing entries
    List<String> codecs = newArrayList(Splitter.on(",").trimResults().omitEmptyStrings().split(jobConf.get("io.compression.codecs", "")));
    if (!codecs.contains(LzoCodec.class.getName())) {
        codecs.add(0, LzoCodec.class.getName());
    }
    if (!codecs.contains(LzopCodec.class.getName())) {
        codecs.add(0, LzopCodec.class.getName());
    }
    jobConf.set("io.compression.codecs", codecs.stream().collect(joining(",")));
    try {
        RecordReader<WritableComparable, Writable> recordReader = (RecordReader<WritableComparable, Writable>) inputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL);
        int headerCount = getHeaderCount(schema);
        // Only skip header rows when the split is at the beginning of the file
        if (start == 0 && headerCount > 0) {
            Utilities.skipHeader(recordReader, headerCount, recordReader.createKey(), recordReader.createValue());
        }
        int footerCount = getFooterCount(schema);
        if (footerCount > 0) {
            recordReader = new FooterAwareRecordReader<>(recordReader, footerCount, jobConf);
        }
        return recordReader;
    } catch (IOException e) {
        if (e instanceof TextLineLengthLimitExceededException) {
            throw new PrestoException(HIVE_BAD_DATA, "Line too long in text file: " + path, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, format("Error opening Hive split %s (offset=%s, length=%s) using %s: %s", path, start, length, getInputFormatName(schema), firstNonNull(e.getMessage(), e.getClass().getName())), e);
    }
}
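createRecordReader ultimately hands a FileSplit and a JobConf to the input format's getRecordReader method. A minimal sketch of that same handshake against plain text files, using Hadoop's TextInputFormat instead of the Hive-resolved input format (the file path is taken from the command line, no Presto classes involved):

import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class PlainTextSplitReaderSketch {

    public static void main(String[] args) throws IOException {
        JobConf jobConf = new JobConf();
        Path path = new Path(args[0]);
        FileStatus status = path.getFileSystem(jobConf).getFileStatus(path);
        // One split covering the whole file, analogous to the (path, start, length) split above.
        FileSplit split = new FileSplit(path, 0, status.getLen(), (String[]) null);
        TextInputFormat inputFormat = new TextInputFormat();
        inputFormat.configure(jobConf);
        RecordReader<LongWritable, Text> reader = inputFormat.getRecordReader(split, jobConf, Reporter.NULL);
        LongWritable key = reader.createKey();
        Text value = reader.createValue();
        while (reader.next(key, value)) {
            System.out.println(value);
        }
        reader.close();
    }
}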