
Example 96 with Text

Use of org.apache.hadoop.io.Text in project hadoop by apache.

The class TestDataJoin, method writeSimpleSrc.

private static Path[] writeSimpleSrc(Path testdir, JobConf conf, int srcs) throws IOException {
    SequenceFile.Writer[] out = null;
    Path[] src = new Path[srcs];
    try {
        out = createWriters(testdir, conf, srcs, src);
        final int capacity = srcs * 2 + 1;
        Text key = new Text();
        key.set("ignored");
        Text val = new Text();
        for (int k = 0; k < capacity; ++k) {
            for (int i = 0; i < srcs; ++i) {
                val.set(Integer.toString(k % srcs == 0 ? k * srcs : k * srcs + i) + "\t" + Integer.toString(10 * k + i));
                out[i].append(key, val);
                if (i == k) {
                    // add duplicate key
                    out[i].append(key, val);
                }
            }
        }
    } finally {
        if (out != null) {
            for (int i = 0; i < srcs; ++i) {
                if (out[i] != null)
                    out[i].close();
            }
        }
    }
    return src;
}
Also used: Path (org.apache.hadoop.fs.Path), Text (org.apache.hadoop.io.Text)
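
The snippet above reuses a single Text key and a single Text value for every record: Text.set() overwrites the backing bytes, and SequenceFile.Writer.append() serializes them out at call time, so no per-record allocation is needed. A minimal, self-contained sketch of the same write pattern, with a made-up output path and record contents standing in for the test's helper-created writers:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class TextSequenceWriteSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path file = new Path("/tmp/text-demo.seq"); // hypothetical path
        SequenceFile.Writer out = null;
        try {
            out = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(file),
                SequenceFile.Writer.keyClass(Text.class),
                SequenceFile.Writer.valueClass(Text.class));
            Text key = new Text();
            Text val = new Text();
            for (int i = 0; i < 10; i++) {
                key.set("key-" + i);      // set() reuses the same Text instance
                val.set("value\t" + i);
                out.append(key, val);     // writer copies the bytes on append
            }
        } finally {
            IOUtils.closeStream(out);     // mirrors the finally-close in the test
        }
    }
}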

Example 97 with Text

Use of org.apache.hadoop.io.Text in project hadoop by apache.

The class TestDataJoin, method confirmOutput.

private static void confirmOutput(Path out, JobConf job, int srcs) throws IOException {
    FileSystem fs = out.getFileSystem(job);
    FileStatus[] outlist = fs.listStatus(out);
    assertEquals(1, outlist.length);
    assertTrue(0 < outlist[0].getLen());
    FSDataInputStream in = fs.open(outlist[0].getPath());
    LineRecordReader rr = new LineRecordReader(in, 0, Integer.MAX_VALUE, job);
    LongWritable k = new LongWritable();
    Text v = new Text();
    int count = 0;
    while (rr.next(k, v)) {
        String[] vals = v.toString().split("\t");
        assertEquals(srcs + 1, vals.length);
        int[] ivals = new int[vals.length];
        for (int i = 0; i < vals.length; ++i) ivals[i] = Integer.parseInt(vals[i]);
        assertEquals(0, ivals[0] % (srcs * srcs));
        for (int i = 1; i < vals.length; ++i) {
            assertEquals((ivals[i] - (i - 1)) * srcs, 10 * ivals[0]);
        }
        ++count;
    }
    assertEquals(4, count);
}
Also used: FileStatus (org.apache.hadoop.fs.FileStatus), FileSystem (org.apache.hadoop.fs.FileSystem), FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream), Text (org.apache.hadoop.io.Text), LongWritable (org.apache.hadoop.io.LongWritable)
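
confirmOutput scans the job output with the old-API LineRecordReader, reusing one LongWritable (the line's byte offset) and one Text (the line itself) across the whole loop, then splits each Text value on tabs. A stripped-down sketch of that read loop, assuming a plain local file in place of the test's job output:

import java.io.FileInputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.LineRecordReader;

public class TextLineReadSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // hypothetical local file standing in for the job output
        FileInputStream in = new FileInputStream("/tmp/join-output.txt");
        LineRecordReader rr = new LineRecordReader(in, 0, Integer.MAX_VALUE, conf);
        LongWritable key = new LongWritable(); // byte offset of the line
        Text line = new Text();                // line contents, reused per record
        try {
            while (rr.next(key, line)) {
                String[] cols = line.toString().split("\t");
                System.out.println(key.get() + " -> " + cols.length + " columns");
            }
        } finally {
            rr.close();
        }
    }
}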

Example 98 with Text

Use of org.apache.hadoop.io.Text in project hadoop by apache.

The class CopyListing, method validateFinalListing.

/**
   * Validate the final resulting path listing.  Checks if there are duplicate
   * entries.  If preserving ACLs, checks that the file system can support ACLs.
   * If preserving XAttrs, checks that the file system can support XAttrs.
   *
   * @param pathToListFile - path listing built by doBuildListing
   * @param options - input options to distcp
   * @throws IOException - on any issue while reading or validating the listing
   * @throws DuplicateFileException - if there are duplicate entries
   */
private void validateFinalListing(Path pathToListFile, DistCpOptions options) throws DuplicateFileException, IOException {
    Configuration config = getConf();
    FileSystem fs = pathToListFile.getFileSystem(config);
    Path sortedList = DistCpUtils.sortListing(fs, config, pathToListFile);
    SequenceFile.Reader reader = new SequenceFile.Reader(config, SequenceFile.Reader.file(sortedList));
    try {
        // a source relative path can never contain '*'
        Text lastKey = new Text("*");
        CopyListingFileStatus lastFileStatus = new CopyListingFileStatus();
        Text currentKey = new Text();
        Set<URI> aclSupportCheckFsSet = Sets.newHashSet();
        Set<URI> xAttrSupportCheckFsSet = Sets.newHashSet();
        long idx = 0;
        while (reader.next(currentKey)) {
            if (currentKey.equals(lastKey)) {
                CopyListingFileStatus currentFileStatus = new CopyListingFileStatus();
                reader.getCurrentValue(currentFileStatus);
                throw new DuplicateFileException("File " + lastFileStatus.getPath() + " and " + currentFileStatus.getPath() + " would cause duplicates. Aborting");
            }
            reader.getCurrentValue(lastFileStatus);
            if (options.shouldPreserve(DistCpOptions.FileAttribute.ACL)) {
                FileSystem lastFs = lastFileStatus.getPath().getFileSystem(config);
                URI lastFsUri = lastFs.getUri();
                if (!aclSupportCheckFsSet.contains(lastFsUri)) {
                    DistCpUtils.checkFileSystemAclSupport(lastFs);
                    aclSupportCheckFsSet.add(lastFsUri);
                }
            }
            if (options.shouldPreserve(DistCpOptions.FileAttribute.XATTR)) {
                FileSystem lastFs = lastFileStatus.getPath().getFileSystem(config);
                URI lastFsUri = lastFs.getUri();
                if (!xAttrSupportCheckFsSet.contains(lastFsUri)) {
                    DistCpUtils.checkFileSystemXAttrSupport(lastFs);
                    xAttrSupportCheckFsSet.add(lastFsUri);
                }
            }
            lastKey.set(currentKey);
            if (options.shouldUseDiff() && LOG.isDebugEnabled()) {
                LOG.debug("Copy list entry " + idx + ": " + lastFileStatus.getPath().toUri().getPath());
                idx++;
            }
        }
    } finally {
        IOUtils.closeStream(reader);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), SequenceFile (org.apache.hadoop.io.SequenceFile), FileSystem (org.apache.hadoop.fs.FileSystem), Text (org.apache.hadoop.io.Text), URI (java.net.URI)
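
The duplicate check is only valid because the listing is sorted first, so equal keys must be adjacent; a single lastKey Text, seeded with "*" (which can never occur as a source relative path), is compared against each key and then overwritten in place with Text.set(). A minimal sketch of that adjacent-duplicate scan over any sorted Text-keyed SequenceFile (the method name and exception are illustrative, not from CopyListing):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class DuplicateKeyScanSketch {
    // assumes 'sorted' points at a SequenceFile with Text keys, already sorted
    static void assertNoAdjacentDuplicates(Configuration conf, Path sorted)
            throws Exception {
        SequenceFile.Reader reader =
            new SequenceFile.Reader(conf, SequenceFile.Reader.file(sorted));
        try {
            Text lastKey = new Text("*"); // sentinel assumed never to occur as a key
            Text currentKey = new Text();
            while (reader.next(currentKey)) {
                if (currentKey.equals(lastKey)) {
                    throw new IllegalStateException("duplicate key: " + currentKey);
                }
                lastKey.set(currentKey);  // copy the bytes; both Texts stay reusable
            }
        } finally {
            IOUtils.closeStream(reader);
        }
    }
}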

Example 99 with Text

Use of org.apache.hadoop.io.Text in project hadoop by apache.

The class DynamicInputFormat, method splitCopyListingIntoChunksWithShuffle.

private List<DynamicInputChunk> splitCopyListingIntoChunksWithShuffle(JobContext context) throws IOException {
    final Configuration configuration = context.getConfiguration();
    int numRecords = getNumberOfRecords(configuration);
    int numMaps = getNumMapTasks(configuration);
    int maxChunksTolerable = getMaxChunksTolerable(configuration);
    // Number of chunks each map will process, on average.
    int splitRatio = getListingSplitRatio(configuration, numMaps, numRecords);
    validateNumChunksUsing(splitRatio, numMaps, maxChunksTolerable);
    int numEntriesPerChunk = (int) Math.ceil((float) numRecords / (splitRatio * numMaps));
    DistCpUtils.publish(context.getConfiguration(), CONF_LABEL_NUM_ENTRIES_PER_CHUNK, numEntriesPerChunk);
    final int nChunksTotal = (int) Math.ceil((float) numRecords / numEntriesPerChunk);
    int nChunksOpenAtOnce = Math.min(N_CHUNKS_OPEN_AT_ONCE_DEFAULT, nChunksTotal);
    Path listingPath = getListingFilePath(configuration);
    SequenceFile.Reader reader = new SequenceFile.Reader(configuration, SequenceFile.Reader.file(listingPath));
    List<DynamicInputChunk> openChunks = new ArrayList<DynamicInputChunk>();
    List<DynamicInputChunk> chunksFinal = new ArrayList<DynamicInputChunk>();
    CopyListingFileStatus fileStatus = new CopyListingFileStatus();
    Text relPath = new Text();
    int recordCounter = 0;
    int chunkCount = 0;
    try {
        while (reader.next(relPath, fileStatus)) {
            if (recordCounter % (nChunksOpenAtOnce * numEntriesPerChunk) == 0) {
                // All chunks full. Create new chunk-set.
                closeAll(openChunks);
                chunksFinal.addAll(openChunks);
                openChunks = createChunks(chunkCount, nChunksTotal, nChunksOpenAtOnce);
                chunkCount += openChunks.size();
                nChunksOpenAtOnce = openChunks.size();
                recordCounter = 0;
            }
            // Shuffle into open chunks.
            openChunks.get(recordCounter % nChunksOpenAtOnce).write(relPath, fileStatus);
            ++recordCounter;
        }
    } finally {
        closeAll(openChunks);
        chunksFinal.addAll(openChunks);
        IOUtils.closeStream(reader);
    }
    LOG.info("Number of dynamic-chunk-files created: " + chunksFinal.size());
    return chunksFinal;
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), SequenceFile (org.apache.hadoop.io.SequenceFile), CopyListingFileStatus (org.apache.hadoop.tools.CopyListingFileStatus), ArrayList (java.util.ArrayList), Text (org.apache.hadoop.io.Text)
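
The chunk sizing is plain ceiling arithmetic: on average each map should pick up splitRatio chunks, so the listing is cut into roughly splitRatio * numMaps chunks. A worked example with made-up inputs (1000 records, 4 maps, split ratio 2):

public class ChunkMathSketch {
    public static void main(String[] args) {
        int numRecords = 1000, numMaps = 4, splitRatio = 2; // hypothetical inputs
        int numEntriesPerChunk =
            (int) Math.ceil((float) numRecords / (splitRatio * numMaps)); // 125
        int nChunksTotal =
            (int) Math.ceil((float) numRecords / numEntriesPerChunk);     // 8
        System.out.println(numEntriesPerChunk + " entries/chunk, "
            + nChunksTotal + " chunks for " + numMaps + " maps");
    }
}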

Example 100 with Text

Use of org.apache.hadoop.io.Text in project hadoop by apache.

The class TestCopyListing, method testBuildListingForSingleFile.

@Test(timeout = 10000)
public void testBuildListingForSingleFile() {
    FileSystem fs = null;
    String testRootString = "/singleFileListing";
    Path testRoot = new Path(testRootString);
    SequenceFile.Reader reader = null;
    try {
        fs = FileSystem.get(getConf());
        if (fs.exists(testRoot))
            TestDistCpUtils.delete(fs, testRootString);
        Path sourceFile = new Path(testRoot, "/source/foo/bar/source.txt");
        Path decoyFile = new Path(testRoot, "/target/moo/source.txt");
        Path targetFile = new Path(testRoot, "/target/moo/target.txt");
        TestDistCpUtils.createFile(fs, sourceFile.toString());
        TestDistCpUtils.createFile(fs, decoyFile.toString());
        TestDistCpUtils.createFile(fs, targetFile.toString());
        List<Path> srcPaths = new ArrayList<Path>();
        srcPaths.add(sourceFile);
        DistCpOptions options = new DistCpOptions(srcPaths, targetFile);
        CopyListing listing = new SimpleCopyListing(getConf(), CREDENTIALS);
        final Path listFile = new Path(testRoot, "/tmp/fileList.seq");
        listing.buildListing(listFile, options);
        reader = new SequenceFile.Reader(getConf(), SequenceFile.Reader.file(listFile));
        CopyListingFileStatus fileStatus = new CopyListingFileStatus();
        Text relativePath = new Text();
        Assert.assertTrue(reader.next(relativePath, fileStatus));
        Assert.assertTrue(relativePath.toString().equals(""));
    } catch (Exception e) {
        // log first: Assert.fail() throws, so anything after it is unreachable
        LOG.error("Unexpected exception: ", e);
        Assert.fail("Unexpected exception encountered.");
    } finally {
        TestDistCpUtils.delete(fs, testRootString);
        IOUtils.closeStream(reader);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), ArrayList (java.util.ArrayList), Text (org.apache.hadoop.io.Text), IOException (java.io.IOException), SequenceFile (org.apache.hadoop.io.SequenceFile), FileSystem (org.apache.hadoop.fs.FileSystem), Test (org.junit.Test)

Aggregations

Text (org.apache.hadoop.io.Text): 1012
Test (org.junit.Test): 397
Path (org.apache.hadoop.fs.Path): 180
Configuration (org.apache.hadoop.conf.Configuration): 169
LongWritable (org.apache.hadoop.io.LongWritable): 141
IOException (java.io.IOException): 139
IntWritable (org.apache.hadoop.io.IntWritable): 115
FileSystem (org.apache.hadoop.fs.FileSystem): 109
ArrayList (java.util.ArrayList): 100
Token (org.apache.hadoop.security.token.Token): 94
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 86
BytesWritable (org.apache.hadoop.io.BytesWritable): 73
SequenceFile (org.apache.hadoop.io.SequenceFile): 68
Credentials (org.apache.hadoop.security.Credentials): 63
DeferredObject (org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject): 54
DeferredJavaObject (org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject): 53
JobConf (org.apache.hadoop.mapred.JobConf): 50
FloatWritable (org.apache.hadoop.io.FloatWritable): 46
BooleanWritable (org.apache.hadoop.io.BooleanWritable): 45
DoubleWritable (org.apache.hadoop.hive.serde2.io.DoubleWritable): 42