Use of org.apache.hadoop.io.Text in project hadoop by apache.
The class TestDataJoin, method writeSimpleSrc:
private static Path[] writeSimpleSrc(Path testdir, JobConf conf, int srcs)
    throws IOException {
  SequenceFile.Writer[] out = null;
  Path[] src = new Path[srcs];
  try {
    out = createWriters(testdir, conf, srcs, src);
    final int capacity = srcs * 2 + 1;
    Text key = new Text();
    key.set("ignored");
    Text val = new Text();
    for (int k = 0; k < capacity; ++k) {
      for (int i = 0; i < srcs; ++i) {
        // Each record is tab-separated: a join field, then a per-(k, i) payload.
        val.set(Integer.toString(k % srcs == 0 ? k * srcs : k * srcs + i)
            + "\t" + Integer.toString(10 * k + i));
        out[i].append(key, val);
        if (i == k) {
          // add duplicate key
          out[i].append(key, val);
        }
      }
    }
  } finally {
    if (out != null) {
      for (int i = 0; i < srcs; ++i) {
        if (out[i] != null) {
          out[i].close();
        }
      }
    }
  }
  return src;
}
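The createWriters helper is not shown on this page. A minimal sketch of what it could look like, assuming one SequenceFile per source with Text keys and Text values; the file-naming scheme is illustrative, not the actual test utility:

// Hypothetical sketch of the createWriters helper referenced above.
private static SequenceFile.Writer[] createWriters(Path testdir, JobConf conf,
    int srcs, Path[] src) throws IOException {
  SequenceFile.Writer[] out = new SequenceFile.Writer[srcs];
  for (int i = 0; i < srcs; ++i) {
    src[i] = new Path(testdir, "src" + i + ".seq"); // illustrative file name
    out[i] = SequenceFile.createWriter(conf,
        SequenceFile.Writer.file(src[i]),
        SequenceFile.Writer.keyClass(Text.class),
        SequenceFile.Writer.valueClass(Text.class));
  }
  return out;
}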
Use of org.apache.hadoop.io.Text in project hadoop by apache.
The class TestDataJoin, method confirmOutput:
private static void confirmOutput(Path out, JobConf job, int srcs)
    throws IOException {
  FileSystem fs = out.getFileSystem(job);
  FileStatus[] outlist = fs.listStatus(out);
  assertEquals(1, outlist.length);
  assertTrue(0 < outlist[0].getLen());
  FSDataInputStream in = fs.open(outlist[0].getPath());
  LineRecordReader rr = new LineRecordReader(in, 0, Integer.MAX_VALUE, job);
  LongWritable k = new LongWritable();
  Text v = new Text();
  int count = 0;
  while (rr.next(k, v)) {
    String[] vals = v.toString().split("\t");
    assertEquals(srcs + 1, vals.length);
    int[] ivals = new int[vals.length];
    for (int i = 0; i < vals.length; ++i) {
      ivals[i] = Integer.parseInt(vals[i]);
    }
    assertEquals(0, ivals[0] % (srcs * srcs));
    for (int i = 1; i < vals.length; ++i) {
      assertEquals((ivals[i] - (i - 1)) * srcs, 10 * ivals[0]);
    }
    ++count;
  }
  assertEquals(4, count);
}
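Note that confirmOutput reuses a single Text instance for every record: rr.next(k, v) overwrites v's backing buffer in place, which is why each iteration snapshots it with v.toString() before splitting. A small illustrative fragment (reusing rr, k, and v from the method above) showing the difference between aliasing the reused Writable and copying its contents:

// Illustrative: reused Writables alias one buffer; toString() snapshots it.
List<Text> aliased = new ArrayList<>();   // every element is the same Text object
List<String> copied = new ArrayList<>();  // each element is an immutable copy
while (rr.next(k, v)) {
  aliased.add(v);
  copied.add(v.toString());
}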
Use of org.apache.hadoop.io.Text in project hadoop by apache.
The class CopyListing, method validateFinalListing:
/**
 * Validate the final resulting path listing. Checks if there are duplicate
 * entries. If preserving ACLs, checks that the file system can support ACLs.
 * If preserving XAttrs, checks that the file system can support XAttrs.
 *
 * @param pathToListFile - path listing built by doBuildListing
 * @param options - input options to distcp
 * @throws IOException - any issues while checking for duplicates
 * @throws DuplicateFileException - if there are duplicates
 */
private void validateFinalListing(Path pathToListFile, DistCpOptions options)
    throws DuplicateFileException, IOException {
  Configuration config = getConf();
  FileSystem fs = pathToListFile.getFileSystem(config);
  Path sortedList = DistCpUtils.sortListing(fs, config, pathToListFile);
  SequenceFile.Reader reader = new SequenceFile.Reader(config,
      SequenceFile.Reader.file(sortedList));
  try {
    // A source relative path can never hold '*', so it is a safe sentinel.
    Text lastKey = new Text("*");
    CopyListingFileStatus lastFileStatus = new CopyListingFileStatus();
    Text currentKey = new Text();
    Set<URI> aclSupportCheckFsSet = Sets.newHashSet();
    Set<URI> xAttrSupportCheckFsSet = Sets.newHashSet();
    long idx = 0;
    while (reader.next(currentKey)) {
      // The listing is sorted, so any duplicate keys are adjacent.
      if (currentKey.equals(lastKey)) {
        CopyListingFileStatus currentFileStatus = new CopyListingFileStatus();
        reader.getCurrentValue(currentFileStatus);
        throw new DuplicateFileException("File " + lastFileStatus.getPath()
            + " and " + currentFileStatus.getPath()
            + " would cause duplicates. Aborting");
      }
      reader.getCurrentValue(lastFileStatus);
      if (options.shouldPreserve(DistCpOptions.FileAttribute.ACL)) {
        FileSystem lastFs = lastFileStatus.getPath().getFileSystem(config);
        URI lastFsUri = lastFs.getUri();
        if (!aclSupportCheckFsSet.contains(lastFsUri)) {
          DistCpUtils.checkFileSystemAclSupport(lastFs);
          aclSupportCheckFsSet.add(lastFsUri);
        }
      }
      if (options.shouldPreserve(DistCpOptions.FileAttribute.XATTR)) {
        FileSystem lastFs = lastFileStatus.getPath().getFileSystem(config);
        URI lastFsUri = lastFs.getUri();
        if (!xAttrSupportCheckFsSet.contains(lastFsUri)) {
          DistCpUtils.checkFileSystemXAttrSupport(lastFs);
          xAttrSupportCheckFsSet.add(lastFsUri);
        }
      }
      lastKey.set(currentKey);
      if (options.shouldUseDiff() && LOG.isDebugEnabled()) {
        LOG.debug("Copy list entry " + idx + ": "
            + lastFileStatus.getPath().toUri().getPath());
        idx++;
      }
    }
  } finally {
    IOUtils.closeStream(reader);
  }
}
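Two Text details carry this method: the "*" sentinel is safe because a source relative path can never be that string, and lastKey.set(currentKey) copies currentKey's bytes into lastKey's own buffer rather than aliasing the object, so the next iteration compares against the previous key's contents. A minimal sketch of that copy semantics:

// Text.set(Text) copies bytes; later mutation of the source has no effect.
Text a = new Text("alpha");
Text b = new Text();
b.set(a);               // copies a's bytes into b's own buffer
a.set("beta");          // mutating a afterwards does not change b
System.out.println(b);  // prints "alpha"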
Use of org.apache.hadoop.io.Text in project hadoop by apache.
The class DynamicInputFormat, method splitCopyListingIntoChunksWithShuffle:
private List<DynamicInputChunk> splitCopyListingIntoChunksWithShuffle(
    JobContext context) throws IOException {
  final Configuration configuration = context.getConfiguration();
  int numRecords = getNumberOfRecords(configuration);
  int numMaps = getNumMapTasks(configuration);
  int maxChunksTolerable = getMaxChunksTolerable(configuration);
  // Number of chunks each map will process, on average.
  int splitRatio = getListingSplitRatio(configuration, numMaps, numRecords);
  validateNumChunksUsing(splitRatio, numMaps, maxChunksTolerable);
  int numEntriesPerChunk = (int) Math.ceil((float) numRecords
      / (splitRatio * numMaps));
  DistCpUtils.publish(context.getConfiguration(),
      CONF_LABEL_NUM_ENTRIES_PER_CHUNK, numEntriesPerChunk);
  final int nChunksTotal = (int) Math.ceil((float) numRecords
      / numEntriesPerChunk);
  int nChunksOpenAtOnce = Math.min(N_CHUNKS_OPEN_AT_ONCE_DEFAULT, nChunksTotal);
  Path listingPath = getListingFilePath(configuration);
  SequenceFile.Reader reader = new SequenceFile.Reader(configuration,
      SequenceFile.Reader.file(listingPath));
  List<DynamicInputChunk> openChunks = new ArrayList<DynamicInputChunk>();
  List<DynamicInputChunk> chunksFinal = new ArrayList<DynamicInputChunk>();
  CopyListingFileStatus fileStatus = new CopyListingFileStatus();
  Text relPath = new Text();
  int recordCounter = 0;
  int chunkCount = 0;
  try {
    while (reader.next(relPath, fileStatus)) {
      if (recordCounter % (nChunksOpenAtOnce * numEntriesPerChunk) == 0) {
        // All chunks full. Create new chunk-set.
        closeAll(openChunks);
        chunksFinal.addAll(openChunks);
        openChunks = createChunks(chunkCount, nChunksTotal, nChunksOpenAtOnce);
        chunkCount += openChunks.size();
        nChunksOpenAtOnce = openChunks.size();
        recordCounter = 0;
      }
      // Shuffle into open chunks.
      openChunks.get(recordCounter % nChunksOpenAtOnce).write(relPath, fileStatus);
      ++recordCounter;
    }
  } finally {
    closeAll(openChunks);
    chunksFinal.addAll(openChunks);
    IOUtils.closeStream(reader);
  }
  LOG.info("Number of dynamic-chunk-files created: " + chunksFinal.size());
  return chunksFinal;
}
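To make the chunk arithmetic concrete with illustrative numbers (not taken from the source): with numRecords = 700, numMaps = 10, and splitRatio = 2, numEntriesPerChunk = ceil(700 / (2 * 10)) = 35 and nChunksTotal = ceil(700 / 35) = 20. If N_CHUNKS_OPEN_AT_ONCE_DEFAULT were, say, 16, the loop would round-robin the first 16 * 35 = 560 records across 16 open chunks, close them, then open the remaining 4 chunks for the last 140 records.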
Use of org.apache.hadoop.io.Text in project hadoop by apache.
The class TestCopyListing, method testBuildListingForSingleFile:
@Test(timeout = 10000)
public void testBuildListingForSingleFile() {
  FileSystem fs = null;
  String testRootString = "/singleFileListing";
  Path testRoot = new Path(testRootString);
  SequenceFile.Reader reader = null;
  try {
    fs = FileSystem.get(getConf());
    if (fs.exists(testRoot)) {
      TestDistCpUtils.delete(fs, testRootString);
    }
    Path sourceFile = new Path(testRoot, "/source/foo/bar/source.txt");
    Path decoyFile = new Path(testRoot, "/target/moo/source.txt");
    Path targetFile = new Path(testRoot, "/target/moo/target.txt");
    TestDistCpUtils.createFile(fs, sourceFile.toString());
    TestDistCpUtils.createFile(fs, decoyFile.toString());
    TestDistCpUtils.createFile(fs, targetFile.toString());
    List<Path> srcPaths = new ArrayList<Path>();
    srcPaths.add(sourceFile);
    DistCpOptions options = new DistCpOptions(srcPaths, targetFile);
    CopyListing listing = new SimpleCopyListing(getConf(), CREDENTIALS);
    final Path listFile = new Path(testRoot, "/tmp/fileList.seq");
    listing.buildListing(listFile, options);
    reader = new SequenceFile.Reader(getConf(), SequenceFile.Reader.file(listFile));
    CopyListingFileStatus fileStatus = new CopyListingFileStatus();
    Text relativePath = new Text();
    Assert.assertTrue(reader.next(relativePath, fileStatus));
    Assert.assertTrue(relativePath.toString().equals(""));
  } catch (Exception e) {
    // Log before failing: Assert.fail throws, so a statement after it would
    // be unreachable dead code.
    LOG.error("Unexpected exception: ", e);
    Assert.fail("Unexpected exception encountered.");
  } finally {
    TestDistCpUtils.delete(fs, testRootString);
    IOUtils.closeStream(reader);
  }
}
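The final assertion expects an empty relative path: when a single file is the entire source, it sits at the root of the listing, so nothing remains once the source root is stripped from its path. For inspecting such a listing, a hedged sketch using only the standard SequenceFile.Reader API (the try-with-resources form is an alternative to the closeStream pattern above):

// Illustrative: dump every (relative-path, status) pair in a DistCp listing.
try (SequenceFile.Reader r = new SequenceFile.Reader(getConf(),
    SequenceFile.Reader.file(listFile))) {
  Text relPath = new Text();
  CopyListingFileStatus status = new CopyListingFileStatus();
  while (r.next(relPath, status)) {
    System.out.println("'" + relPath + "' -> " + status.getPath());
  }
}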