use of org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl in project hadoop by apache.
the class TestCopyOutputFormat method testGetOutputCommitter.
@Test
public void testGetOutputCommitter() {
  try {
    TaskAttemptContext context = new TaskAttemptContextImpl(new Configuration(),
        new TaskAttemptID("200707121733", 1, TaskType.MAP, 1, 1));
    context.getConfiguration().set("mapred.output.dir", "/out");
    Assert.assertTrue(new CopyOutputFormat().getOutputCommitter(context) instanceof CopyCommitter);
  } catch (IOException e) {
    LOG.error("Exception encountered ", e);
    Assert.fail("Unable to get output committer");
  }
}
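For orientation, all of the examples on this page share the same construction pattern: wrap a Configuration and a TaskAttemptID in a TaskAttemptContextImpl and hand the resulting context to whatever mapreduce component needs it. The following is a minimal, self-contained sketch of just that pattern; it is not taken from any of the aggregated projects, and the job-tracker identifier, ID components and configuration key are arbitrary placeholders.

// Minimal sketch (not from the aggregated projects) of constructing a TaskAttemptContextImpl
// directly around a Configuration and a TaskAttemptID.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class TaskAttemptContextSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    conf.set("example.key", "example.value"); // placeholder key/value
    // TaskAttemptID(jtIdentifier, jobId, taskType, taskId, attemptId) - values are arbitrary here
    TaskAttemptID attemptId = new TaskAttemptID("200707121733", 1, TaskType.MAP, 0, 0);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, attemptId);
    // The context exposes the wrapped configuration and attempt id to
    // InputFormats, OutputFormats, RecordReaders and RecordWriters
    System.out.println(context.getTaskAttemptID());
    System.out.println(context.getConfiguration().get("example.key"));
  }
}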
use of org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl in project hadoop by apache.
the class TestGridMixClasses method testLoadJobLoadRecordReader.
/*
 * Test LoadRecordReader, which reads data from the input files.
 */
@Test(timeout = 3000)
public void testLoadJobLoadRecordReader() throws Exception {
  LoadJob.LoadRecordReader test = new LoadJob.LoadRecordReader();
  Configuration conf = new Configuration();
  FileSystem fs1 = mock(FileSystem.class);
  when(fs1.open((Path) anyObject())).thenReturn(new FakeFSDataInputStream(new FakeInputStream()));
  Path p1 = mock(Path.class);
  when(p1.getFileSystem((JobConf) anyObject())).thenReturn(fs1);
  FileSystem fs2 = mock(FileSystem.class);
  when(fs2.open((Path) anyObject())).thenReturn(new FakeFSDataInputStream(new FakeInputStream()));
  Path p2 = mock(Path.class);
  when(p2.getFileSystem((JobConf) anyObject())).thenReturn(fs2);
  Path[] paths = { p1, p2 };
  long[] start = { 0, 0 };
  long[] lengths = { 1000, 1000 };
  String[] locations = { "temp1", "temp2" };
  CombineFileSplit cfsplit = new CombineFileSplit(paths, start, lengths, locations);
  double[] reduceBytes = { 100, 100 };
  double[] reduceRecords = { 2, 2 };
  long[] reduceOutputBytes = { 500, 500 };
  long[] reduceOutputRecords = { 2, 2 };
  ResourceUsageMetrics metrics = new ResourceUsageMetrics();
  ResourceUsageMetrics[] rMetrics = { new ResourceUsageMetrics(), new ResourceUsageMetrics() };
  LoadSplit input = new LoadSplit(cfsplit, 2, 3, 1500L, 2L, 3000L, 2L, reduceBytes, reduceRecords,
      reduceOutputBytes, reduceOutputRecords, metrics, rMetrics);
  TaskAttemptID taskId = new TaskAttemptID();
  TaskAttemptContext ctx = new TaskAttemptContextImpl(conf, taskId);
  test.initialize(input, ctx);
  GridmixRecord gr = test.getCurrentValue();
  int counter = 0;
  while (test.nextKeyValue()) {
    gr = test.getCurrentValue();
    if (counter == 0) {
      // read first file
      assertEquals(0.5, test.getProgress(), 0.001);
    } else if (counter == 1) {
      // read second file
      assertEquals(1.0, test.getProgress(), 0.001);
    }
    assertEquals(1000, gr.getSize());
    counter++;
  }
  assertEquals(1000, gr.getSize());
  // Two files have been read
  assertEquals(2, counter);
  test.close();
}
use of org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl in project carbondata by apache.
the class StoreCreator method executeGraph.
/**
 * Execute the graph which will further load the data.
 *
 * @param loadModel
 * @param storeLocation
 * @throws Exception
 */
public static void executeGraph(CarbonLoadModel loadModel, String storeLocation) throws Exception {
  new File(storeLocation).mkdirs();
  String outPutLoc = storeLocation + "/etl";
  String databaseName = loadModel.getDatabaseName();
  String tableName = loadModel.getTableName();
  String tempLocationKey = databaseName + '_' + tableName + "_1";
  CarbonProperties.getInstance().addProperty(tempLocationKey, storeLocation);
  CarbonProperties.getInstance().addProperty("store_output_location", outPutLoc);
  CarbonProperties.getInstance().addProperty("send.signal.load", "false");
  CarbonProperties.getInstance().addProperty("carbon.is.columnar.storage", "true");
  CarbonProperties.getInstance().addProperty("carbon.dimension.split.value.in.columnar", "1");
  CarbonProperties.getInstance().addProperty("carbon.is.fullyfilled.bits", "true");
  CarbonProperties.getInstance().addProperty("is.int.based.indexer", "true");
  CarbonProperties.getInstance().addProperty("aggregate.columnar.keyblock", "true");
  CarbonProperties.getInstance().addProperty("high.cardinality.value", "100000");
  CarbonProperties.getInstance().addProperty("is.compressed.keyblock", "false");
  CarbonProperties.getInstance().addProperty("carbon.leaf.node.size", "120000");
  String graphPath = outPutLoc + File.separator + loadModel.getDatabaseName() + File.separator + tableName
      + File.separator + 0 + File.separator + 1 + File.separator + tableName + ".ktr";
  File path = new File(graphPath);
  if (path.exists()) {
    path.delete();
  }
  SchemaInfo info = new SchemaInfo();
  BlockDetails blockDetails = new BlockDetails(new Path(loadModel.getFactFilePath()), 0,
      new File(loadModel.getFactFilePath()).length(), new String[] { "localhost" });
  Configuration configuration = new Configuration();
  CSVInputFormat.setCommentCharacter(configuration, loadModel.getCommentChar());
  CSVInputFormat.setCSVDelimiter(configuration, loadModel.getCsvDelimiter());
  CSVInputFormat.setEscapeCharacter(configuration, loadModel.getEscapeChar());
  CSVInputFormat.setHeaderExtractionEnabled(configuration, true);
  CSVInputFormat.setQuoteCharacter(configuration, loadModel.getQuoteChar());
  CSVInputFormat.setReadBufferSize(configuration, CarbonProperties.getInstance()
      .getProperty(CarbonCommonConstants.CSV_READ_BUFFER_SIZE, CarbonCommonConstants.CSV_READ_BUFFER_SIZE_DEFAULT));
  CSVInputFormat.setNumberOfColumns(configuration, String.valueOf(loadModel.getCsvHeaderColumns().length));
  CSVInputFormat.setMaxColumns(configuration, "10");
  TaskAttemptContextImpl hadoopAttemptContext =
      new TaskAttemptContextImpl(configuration, new TaskAttemptID("", 1, TaskType.MAP, 0, 0));
  CSVInputFormat format = new CSVInputFormat();
  RecordReader<NullWritable, StringArrayWritable> recordReader =
      format.createRecordReader(blockDetails, hadoopAttemptContext);
  CSVRecordReaderIterator readerIterator =
      new CSVRecordReaderIterator(recordReader, blockDetails, hadoopAttemptContext);
  new DataLoadExecutor().execute(loadModel, storeLocation, new CarbonIterator[] { readerIterator });
  info.setDatabaseName(databaseName);
  info.setTableName(tableName);
  writeLoadMetadata(loadModel.getCarbonDataLoadSchema(), loadModel.getTableName(), loadModel.getTableName(),
      new ArrayList<LoadMetadataDetails>());
  String segLocation = storeLocation + "/" + databaseName + "/" + tableName + "/Fact/Part0/Segment_0";
  File file = new File(segLocation);
  File factFile = null;
  File[] folderList = file.listFiles();
  File folder = null;
  for (int i = 0; i < folderList.length; i++) {
    if (folderList[i].isDirectory()) {
      folder = folderList[i];
    }
  }
  if (folder.isDirectory()) {
    File[] files = folder.listFiles();
    for (int i = 0; i < files.length; i++) {
      if (!files[i].isDirectory() && files[i].getName().startsWith("part")) {
        factFile = files[i];
        break;
      }
    }
    // Files.copy(factFile.toPath(), file.toPath(), REPLACE_EXISTING);
    factFile.renameTo(new File(segLocation + "/" + factFile.getName()));
    CarbonUtil.deleteFoldersAndFiles(folder);
  }
}
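The same context object also works with the stock Hadoop input formats. As a point of comparison with the snippet above, here is a generic sketch that reads a plain text file through TextInputFormat instead of CarbonData's CSVInputFormat; the /tmp/example.txt path, the RecordReaderSketch class name and the empty job-tracker identifier are placeholders introduced for illustration, not anything from the CarbonData code.

// Generic sketch (assumption: a readable local file exists at /tmp/example.txt) showing how a
// TaskAttemptContextImpl is threaded through an InputFormat and its RecordReader.
import java.io.File;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class RecordReaderSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    File input = new File("/tmp/example.txt"); // placeholder path, assumed to exist
    FileSplit split = new FileSplit(new Path(input.getAbsolutePath()), 0, input.length(), new String[0]);
    TaskAttemptContext context =
        new TaskAttemptContextImpl(conf, new TaskAttemptID("", 1, TaskType.MAP, 0, 0));
    TextInputFormat format = new TextInputFormat();
    RecordReader<LongWritable, Text> reader = format.createRecordReader(split, context);
    // The CarbonData snippet above hands its reader to CSVRecordReaderIterator;
    // here we initialize and drive the reader directly
    reader.initialize(split, context);
    while (reader.nextKeyValue()) {
      System.out.println(reader.getCurrentKey() + "\t" + reader.getCurrentValue());
    }
    reader.close();
  }
}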
use of org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl in project jena by apache.
the class AbstractBlankNodeTests method blank_node_identity_02.
/**
 * Test that starts with two blank nodes in two different files and checks
 * that writing them to a single file does not conflate them
 *
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public void blank_node_identity_02() throws IOException, InterruptedException {
    Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile());
    Assume.assumeFalse("Requires that Blank Node identity not be preserved", this.preservesBlankNodeIdentity());
    // Temporary files
    File a = File.createTempFile("bnode_identity", getInitialInputExtension());
    File b = File.createTempFile("bnode_identity", getInitialInputExtension());
    File intermediateOutputDir = Files.createTempDirectory("bnode_identity", new FileAttribute[0]).toFile();
    try {
        // Prepare the input data
        // Same blank node but in different files so must be treated as
        // different blank nodes and not converge
        List<T> tuples = new ArrayList<>();
        Node bnode = NodeFactory.createBlankNode();
        Node pred = NodeFactory.createURI("http://example.org/predicate");
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first")));
        writeTuples(a, tuples);
        tuples.clear();
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second")));
        writeTuples(b, tuples);
        // Set up fake job which will process the two files
        Configuration config = new Configuration(true);
        InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat();
        Job job = Job.getInstance(config);
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()), new Path(b.getAbsolutePath()));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath()));
        JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());
        // Get the splits
        List<InputSplit> splits = inputFormat.getSplits(context);
        Assert.assertEquals(2, splits.size());
        // Prepare the output writing - putting all output to a single file
        OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat();
        TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 2, 1));
        RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext);
        for (InputSplit split : splits) {
            // Initialize the input reading
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 1, 1));
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);
            // Copy the input to the single output file
            while (reader.nextKeyValue()) {
                writer.write(reader.getCurrentKey(), reader.getCurrentValue());
            }
        }
        writer.close(outputTaskContext);
        // Promote outputs from temporary status
        promoteInputs(intermediateOutputDir);
        // Now we need to create a subsequent job that reads the
        // intermediate outputs
        // The Blank nodes should have been given separate identities so we
        // should not be conflating them, this is the opposite problem to
        // that described in JENA-820
        LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath());
        job = Job.getInstance(config);
        inputFormat = createIntermediateInputFormat();
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath()));
        context = new JobContextImpl(job.getConfiguration(), job.getJobID());
        // Get the splits
        splits = inputFormat.getSplits(context);
        Assert.assertEquals(1, splits.size());
        // Expect to end up with two distinct blank nodes
        Set<Node> nodes = new HashSet<Node>();
        for (InputSplit split : splits) {
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);
            while (reader.nextKeyValue()) {
                nodes.add(getSubject(reader.getCurrentValue().get()));
            }
        }
        // Nodes must not have been conflated
        Assert.assertEquals(2, nodes.size());
    } finally {
        a.delete();
        b.delete();
        deleteDirectory(intermediateOutputDir);
    }
}
use of org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl in project jena by apache.
the class AbstractBlankNodeTests method blank_node_divergence_01.
/**
 * Test that starts with two blank nodes with the same identity in a single
 * file, splits them over two files and checks that we can workaround
 * JENA-820 successfully by setting the
 * {@link RdfIOConstants#GLOBAL_BNODE_IDENTITY} flag for our subsequent job
 *
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public final void blank_node_divergence_01() throws IOException, InterruptedException {
    Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile());
    Assume.assumeFalse("Requires that Blank Node identity not be preserved", this.preservesBlankNodeIdentity());
    // Temporary files
    File a = File.createTempFile("bnode_divergence", getInitialInputExtension());
    File intermediateOutputDir = Files.createTempDirectory("bnode_divergence", new FileAttribute[0]).toFile();
    try {
        // Prepare the input data
        // Two mentions of the same blank node in the same file
        List<T> tuples = new ArrayList<>();
        Node bnode = NodeFactory.createBlankNode();
        Node pred = NodeFactory.createURI("http://example.org/predicate");
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first")));
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second")));
        writeTuples(a, tuples);
        // Set up fake job which will process the file as a single split
        Configuration config = new Configuration(true);
        InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat();
        Job job = Job.getInstance(config);
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath()));
        JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());
        // Get the splits
        List<InputSplit> splits = inputFormat.getSplits(context);
        Assert.assertEquals(1, splits.size());
        for (InputSplit split : splits) {
            // Initialize the input reading
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 1, 1));
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);
            // Copy the input to the output - each triple goes to a separate
            // output file
            // This is how we force multiple files to be produced
            int taskID = 1;
            while (reader.nextKeyValue()) {
                // Prepare the output writing
                OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat();
                TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, ++taskID, 1));
                RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext);
                writer.write(reader.getCurrentKey(), reader.getCurrentValue());
                writer.close(outputTaskContext);
            }
        }
        // Promote outputs from temporary status
        promoteInputs(intermediateOutputDir);
        // Now we need to create a subsequent job that reads the
        // intermediate outputs
        // As described in JENA-820 at this point the blank nodes are
        // consistent, however when we read them from different files they
        // by default get treated as different nodes and so the blank nodes
        // diverge which is incorrect and undesirable behaviour in
        // multi-stage pipelines
        LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath());
        job = Job.getInstance(config);
        inputFormat = createIntermediateInputFormat();
        job.setInputFormatClass(inputFormat.getClass());
        FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath()));
        // Enabling this flag works around the JENA-820 issue
        job.getConfiguration().setBoolean(RdfIOConstants.GLOBAL_BNODE_IDENTITY, true);
        context = new JobContextImpl(job.getConfiguration(), job.getJobID());
        // Get the splits
        splits = inputFormat.getSplits(context);
        Assert.assertEquals(2, splits.size());
        // Expect to end up with a single blank node
        Set<Node> nodes = new HashSet<Node>();
        for (InputSplit split : splits) {
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);
            while (reader.nextKeyValue()) {
                nodes.add(getSubject(reader.getCurrentValue().get()));
            }
        }
        // Nodes should not have diverged
        Assert.assertEquals(1, nodes.size());
    } finally {
        a.delete();
        deleteDirectory(intermediateOutputDir);
    }
}