Use of org.apache.hadoop.io.LongWritable in project hadoop by apache.
The class RegexMapper, method map.
public void map(K key, Text value, OutputCollector<Text, LongWritable> output, Reporter reporter) throws IOException {
  String text = value.toString();
  Matcher matcher = pattern.matcher(text);
  while (matcher.find()) {
    output.collect(new Text(matcher.group(group)), new LongWritable(1));
  }
}
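For context, here is a minimal companion reducer (hypothetical, not taken from the Hadoop source shown above) illustrating how the LongWritable counts emitted by this mapper would typically be summed with the same old mapred API:

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

// Hypothetical reducer: sums the LongWritable(1) counts emitted per matched group.
public class RegexCountReducer extends MapReduceBase
    implements Reducer<Text, LongWritable, Text, LongWritable> {
  public void reduce(Text key, Iterator<LongWritable> values,
      OutputCollector<Text, LongWritable> output, Reporter reporter) throws IOException {
    long sum = 0;
    while (values.hasNext()) {
      sum += values.next().get();
    }
    output.collect(key, new LongWritable(sum));
  }
}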
Use of org.apache.hadoop.io.LongWritable in project hadoop by apache.
The class TestMultipleOutputs, method _testMultipleOutputs.
protected void _testMultipleOutputs(boolean withCounters) throws Exception {
  Path inDir = getDir(IN_DIR);
  Path outDir = getDir(OUT_DIR);
  JobConf conf = createJobConf();
  FileSystem fs = FileSystem.get(conf);
  DataOutputStream file = fs.create(new Path(inDir, "part-0"));
  file.writeBytes("a\nb\n\nc\nd\ne");
  file.close();
  file = fs.create(new Path(inDir, "part-1"));
  file.writeBytes("a\nb\n\nc\nd\ne");
  file.close();
  conf.setJobName("mo");
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);
  conf.setMapOutputKeyClass(LongWritable.class);
  conf.setMapOutputValueClass(Text.class);
  conf.setOutputFormat(TextOutputFormat.class);
  MultipleOutputs.addNamedOutput(conf, "text", TextOutputFormat.class, LongWritable.class, Text.class);
  MultipleOutputs.addMultiNamedOutput(conf, "sequence", SequenceFileOutputFormat.class, LongWritable.class, Text.class);
  MultipleOutputs.setCountersEnabled(conf, withCounters);
  conf.setMapperClass(MOMap.class);
  conf.setReducerClass(MOReduce.class);
  FileInputFormat.setInputPaths(conf, inDir);
  FileOutputFormat.setOutputPath(conf, outDir);
  JobClient jc = new JobClient(conf);
  RunningJob job = jc.submitJob(conf);
  while (!job.isComplete()) {
    Thread.sleep(100);
  }
  // assert number of named output part files
  int namedOutputCount = 0;
  FileStatus[] statuses = fs.listStatus(outDir);
  for (FileStatus status : statuses) {
    if (status.getPath().getName().equals("text-m-00000") ||
        status.getPath().getName().equals("text-m-00001") ||
        status.getPath().getName().equals("text-r-00000") ||
        status.getPath().getName().equals("sequence_A-m-00000") ||
        status.getPath().getName().equals("sequence_A-m-00001") ||
        status.getPath().getName().equals("sequence_B-m-00000") ||
        status.getPath().getName().equals("sequence_B-m-00001") ||
        status.getPath().getName().equals("sequence_B-r-00000") ||
        status.getPath().getName().equals("sequence_C-r-00000")) {
      namedOutputCount++;
    }
  }
  assertEquals(9, namedOutputCount);
  // assert TextOutputFormat files correctness
  BufferedReader reader = new BufferedReader(new InputStreamReader(
      fs.open(new Path(FileOutputFormat.getOutputPath(conf), "text-r-00000"))));
  int count = 0;
  String line = reader.readLine();
  while (line != null) {
    assertTrue(line.endsWith("text"));
    line = reader.readLine();
    count++;
  }
  reader.close();
  assertFalse(count == 0);
  // assert SequenceOutputFormat files correctness
  SequenceFile.Reader seqReader = new SequenceFile.Reader(fs,
      new Path(FileOutputFormat.getOutputPath(conf), "sequence_B-r-00000"), conf);
  assertEquals(LongWritable.class, seqReader.getKeyClass());
  assertEquals(Text.class, seqReader.getValueClass());
  count = 0;
  LongWritable key = new LongWritable();
  Text value = new Text();
  while (seqReader.next(key, value)) {
    assertEquals("sequence", value.toString());
    count++;
  }
  seqReader.close();
  assertFalse(count == 0);
  Counters.Group counters = job.getCounters().getGroup(MultipleOutputs.class.getName());
  if (!withCounters) {
    assertEquals(0, counters.size());
  } else {
    assertEquals(4, counters.size());
    assertEquals(4, counters.getCounter("text"));
    assertEquals(2, counters.getCounter("sequence_A"));
    assertEquals(4, counters.getCounter("sequence_B"));
    assertEquals(2, counters.getCounter("sequence_C"));
  }
}
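The MOMap and MOReduce classes referenced by the test are not shown above. A hypothetical sketch of a mapper writing to the configured named outputs, assuming the standard old-API MultipleOutputs usage (constructed in configure, closed in close), could look like this; it is an illustration, not the actual test mapper:

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.MultipleOutputs;

// Hypothetical mapper: writes to the base output plus the "text" named output
// and the multi-named "sequence" output registered in the test above.
public class ExampleMOMap extends MapReduceBase
    implements Mapper<LongWritable, Text, LongWritable, Text> {
  private MultipleOutputs mos;

  @Override
  public void configure(JobConf conf) {
    mos = new MultipleOutputs(conf);
  }

  @SuppressWarnings("unchecked")
  public void map(LongWritable key, Text value,
      OutputCollector<LongWritable, Text> output, Reporter reporter) throws IOException {
    // base (job) output
    output.collect(key, value);
    // single named output "text"
    mos.getCollector("text", reporter).collect(key, new Text("text"));
    // multi named output "sequence", part "A"
    mos.getCollector("sequence", "A", reporter).collect(key, new Text("sequence"));
  }

  @Override
  public void close() throws IOException {
    // flush and close all named output collectors
    mos.close();
  }
}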
Use of org.apache.hadoop.io.LongWritable in project hadoop by apache.
The class BlockSender, method sendPacket.
/**
* Sends a packet with up to maxChunks chunks of data.
*
* @param pkt buffer used for writing packet data
* @param maxChunks maximum number of chunks to send
* @param out stream to send data to
* @param transferTo use transferTo to send data
* @param throttler used for throttling data transfer bandwidth
*/
private int sendPacket(ByteBuffer pkt, int maxChunks, OutputStream out,
    boolean transferTo, DataTransferThrottler throttler) throws IOException {
  int dataLen = (int) Math.min(endOffset - offset, (chunkSize * (long) maxChunks));
  // Number of chunks to be sent in the packet
  int numChunks = numberOfChunks(dataLen);
  int checksumDataLen = numChunks * checksumSize;
  int packetLen = dataLen + checksumDataLen + 4;
  boolean lastDataPacket = offset + dataLen == endOffset && dataLen > 0;
  // The packet buffer is organized as follows:
  // _______HHHHCCCCD?D?D?D?
  //        ^   ^
  //        |   \ checksumOff
  //        \ headerOff
  // _ padding, since the header is variable-length
  // H = header and length prefixes
  // C = checksums
  // D? = data, if transferTo is false.
  int headerLen = writePacketHeader(pkt, dataLen, packetLen);
  // Per above, the header doesn't start at the beginning of the buffer
  int headerOff = pkt.position() - headerLen;
  int checksumOff = pkt.position();
  byte[] buf = pkt.array();
  if (checksumSize > 0 && ris.getChecksumIn() != null) {
    readChecksum(buf, checksumOff, checksumDataLen);
    // a write in progress may have updated the last chunk's checksum; use it here
    if (lastDataPacket && lastChunkChecksum != null) {
      int start = checksumOff + checksumDataLen - checksumSize;
      byte[] updatedChecksum = lastChunkChecksum.getChecksum();
      if (updatedChecksum != null) {
        System.arraycopy(updatedChecksum, 0, buf, start, checksumSize);
      }
    }
  }
  int dataOff = checksumOff + checksumDataLen;
  if (!transferTo) {
    // normal transfer
    ris.readDataFully(buf, dataOff, dataLen);
    if (verifyChecksum) {
      verifyChecksum(buf, dataOff, dataLen, numChunks, checksumOff);
    }
  }
  try {
    if (transferTo) {
      SocketOutputStream sockOut = (SocketOutputStream) out;
      // First write header and checksums
      sockOut.write(buf, headerOff, dataOff - headerOff);
      // no need to flush since we know out is not a buffered stream
      FileChannel fileCh = ((FileInputStream) ris.getDataIn()).getChannel();
      LongWritable waitTime = new LongWritable();
      LongWritable transferTime = new LongWritable();
      fileIoProvider.transferToSocketFully(ris.getVolumeRef().getVolume(), sockOut,
          fileCh, blockInPosition, dataLen, waitTime, transferTime);
      datanode.metrics.addSendDataPacketBlockedOnNetworkNanos(waitTime.get());
      datanode.metrics.addSendDataPacketTransferNanos(transferTime.get());
      blockInPosition += dataLen;
    } else {
      // normal transfer
      out.write(buf, headerOff, dataOff + dataLen - headerOff);
    }
  } catch (IOException e) {
    if (e instanceof SocketTimeoutException) {
      /*
       * Writing to the client timed out. This happens if the client reads
       * part of a block and then decides not to read the rest (but leaves
       * the socket open).
       *
       * Reporting of this case is done in DataXceiver#run
       */
    } else {
      /* Exception while writing to the client. Connection closure from
       * the other end is mostly the case and we do not care much about
       * it. But other things can go wrong, especially in transferTo(),
       * which we do not want to ignore.
       *
       * The message parsing below should not be considered as a good
       * coding example. NEVER do it to drive program logic. NEVER.
       * It was done here because NIO throws an IOException for EPIPE.
       */
      String ioem = e.getMessage();
      if (!ioem.startsWith("Broken pipe") && !ioem.startsWith("Connection reset")) {
        LOG.error("BlockSender.sendChunks() exception: ", e);
        datanode.getBlockScanner().markSuspectBlock(
            ris.getVolumeRef().getVolume().getStorageID(), block);
      }
    }
    throw ioeToSocketException(e);
  }
  if (throttler != null) {
    // rebalancing so throttle
    throttler.throttle(packetLen);
  }
  return dataLen;
}
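In this snippet LongWritable is not used as a MapReduce key or value type at all; it serves as a cheap mutable holder so that transferToSocketFully can return two timing values through its parameters. A minimal standalone illustration of the same pattern follows (the names and sleeps are hypothetical, not from the HDFS source):

import org.apache.hadoop.io.LongWritable;

public class TimingHolderExample {
  // Fills the two holders with nanosecond timings instead of returning a value,
  // mirroring how waitTime/transferTime are populated in sendPacket above.
  static void timedWork(LongWritable waitNanos, LongWritable workNanos) throws InterruptedException {
    long start = System.nanoTime();
    Thread.sleep(5);                  // stands in for waiting on the channel
    long afterWait = System.nanoTime();
    Thread.sleep(10);                 // stands in for the actual transfer
    waitNanos.set(afterWait - start);
    workNanos.set(System.nanoTime() - afterWait);
  }

  public static void main(String[] args) throws InterruptedException {
    LongWritable wait = new LongWritable();
    LongWritable work = new LongWritable();
    timedWork(wait, work);
    System.out.println("waited " + wait.get() + " ns, worked " + work.get() + " ns");
  }
}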
Use of org.apache.hadoop.io.LongWritable in project incubator-systemml by apache.
The class DataTransform, method spDataTransform.
public static void spDataTransform(ParameterizedBuiltinSPInstruction inst, FrameObject[] inputs,
    MatrixObject[] outputs, ExecutionContext ec) throws Exception {
  SparkExecutionContext sec = (SparkExecutionContext) ec;
  // Parse transform instruction (the first instruction) to obtain relevant fields
  TransformOperands oprnds = new TransformOperands(inst.getParams(), inputs[0]);
  JobConf job = new JobConf();
  FileSystem fs = IOUtilFunctions.getFileSystem(inputs[0].getFileName());
  checkIfOutputOverlapsWithTxMtd(oprnds.txMtdPath, outputs[0].getFileName(), fs);
  // find the first file in alphabetical ordering of part files in directory inputPath
  String smallestFile = CSVReblockMR.findSmallestFile(job, oprnds.inputPath);
  // find column names and construct output header
  String headerLine = readHeaderLine(fs, oprnds.inputCSVProperties, smallestFile);
  HashMap<String, Integer> colNamesToIds = processColumnNames(fs, oprnds.inputCSVProperties, headerLine, smallestFile);
  int numColumns = colNamesToIds.size();
  String outHeader = getOutputHeader(fs, headerLine, oprnds);
  String tmpPath = MRJobConfiguration.constructTempOutputFilename();
  // Construct RDD for input data
  @SuppressWarnings("unchecked")
  JavaPairRDD<LongWritable, Text> inputData = (JavaPairRDD<LongWritable, Text>)
      sec.getRDDHandleForFrameObject(inputs[0], InputInfo.CSVInputInfo);
  JavaRDD<Tuple2<LongWritable, Text>> csvLines = JavaPairRDD.toRDD(inputData).toJavaRDD();
  long numRowsTf = 0, numColumnsTf = 0;
  JavaPairRDD<Long, String> tfPairRDD = null;
  if (!oprnds.isApply) {
    // build specification file with column IDs instead of column names
    String specWithIDs = processSpecFile(fs, oprnds.inputPath, smallestFile, colNamesToIds,
        oprnds.inputCSVProperties, oprnds.spec);
    // enable GC on colNamesToIds
    colNamesToIds = null;
    // Build transformation metadata, including recode maps, bin definitions, etc.
    // Also, generate part offsets file (counters file), which is to be used in csv-reblock (if needed)
    String partOffsetsFile = MRJobConfiguration.constructTempOutputFilename();
    numRowsTf = GenTfMtdSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs,
        partOffsetsFile, oprnds.inputCSVProperties, numColumns, outHeader);
    // store the spec with IDs as transformation metadata
    MapReduceTool.writeStringToHDFS(specWithIDs, oprnds.txMtdPath + "/" + "spec.json");
    numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
    tfPairRDD = ApplyTfCSVSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, tmpPath,
        oprnds.inputCSVProperties, numColumns, outHeader);
    MapReduceTool.deleteFileIfExistOnHDFS(new Path(partOffsetsFile), job);
  } else {
    // enable GC on colNamesToIds
    colNamesToIds = null;
    // copy given transform metadata (applyTxPath) to specified location (txMtdPath)
    MapReduceTool.deleteFileIfExistOnHDFS(new Path(oprnds.txMtdPath), job);
    MapReduceTool.copyFileOnHDFS(oprnds.applyTxPath, oprnds.txMtdPath);
    // path to specification file
    String specWithIDs = (oprnds.spec != null) ? oprnds.spec
        : MapReduceTool.readStringFromHDFSFile(oprnds.txMtdPath + "/" + "spec.json");
    numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
    // Apply transformation metadata, and perform actual transformation
    tfPairRDD = ApplyTfCSVSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, tmpPath,
        oprnds.inputCSVProperties, numColumns, outHeader);
  }
  // copy auxiliary data (old and new header lines) from temporary location to txMtdPath
  moveFilesFromTmp(fs, tmpPath, oprnds.txMtdPath);
  // convert to csv output format (serialized longwritable/text)
  JavaPairRDD<LongWritable, Text> outtfPairRDD = RDDConverterUtils.stringToSerializableText(tfPairRDD);
  if (outtfPairRDD != null) {
    MatrixObject outMO = outputs[0];
    String outVar = outMO.getVarName();
    outMO.setRDDHandle(new RDDObject(outtfPairRDD, outVar));
    sec.addLineageRDD(outVar, inst.getParams().get("target"));
    // update output statistics (required for correctness)
    MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(outVar);
    mcOut.setDimension(numRowsTf, numColumnsTf);
    mcOut.setNonZeros(-1);
  }
}
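The input RDD here has the shape (LongWritable byte offset, Text line), which is what Hadoop's old-API TextInputFormat produces. Independent of SystemML's internals, a generic sketch of how such a pair RDD is commonly obtained in Spark follows; the application name and the HDFS path are illustrative assumptions:

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class CsvAsPairRDD {
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext(
        new SparkConf().setAppName("csv-as-pair-rdd").setMaster("local[*]"));
    // Each record is (byte offset in file, line text), the same <LongWritable, Text>
    // shape consumed by spDataTransform above.
    JavaPairRDD<LongWritable, Text> csv = sc.hadoopFile(
        "hdfs:///tmp/example.csv", TextInputFormat.class, LongWritable.class, Text.class);
    System.out.println("lines: " + csv.count());
    sc.stop();
  }
}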
Use of org.apache.hadoop.io.LongWritable in project incubator-systemml by apache.
The class MLContextConversionUtil, method javaRDDStringCSVToFrameObject.
/**
* Convert a {@code JavaRDD<String>} in CSV format to a {@code FrameObject}
*
* @param variableName
* name of the variable associated with the frame
* @param javaRDD
* the Java RDD of strings
* @param frameMetadata
* frame metadata
* @return the {@code JavaRDD<String>} converted to a {@code FrameObject}
*/
public static FrameObject javaRDDStringCSVToFrameObject(String variableName, JavaRDD<String> javaRDD,
    FrameMetadata frameMetadata) {
  JavaPairRDD<LongWritable, Text> javaPairRDD = javaRDD.mapToPair(new ConvertStringToLongTextPair());
  MatrixCharacteristics mc = (frameMetadata != null) ? frameMetadata.asMatrixCharacteristics()
      : new MatrixCharacteristics();
  JavaPairRDD<LongWritable, Text> javaPairRDDText = javaPairRDD.mapToPair(new CopyTextInputFunction());
  FrameObject frameObject = new FrameObject(OptimizerUtils.getUniqueTempFileName(),
      new MatrixFormatMetaData(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo),
      frameMetadata.getFrameSchema().getSchema().toArray(new ValueType[0]));
  JavaPairRDD<Long, FrameBlock> rdd;
  try {
    rdd = FrameRDDConverterUtils.csvToBinaryBlock(jsc(), javaPairRDDText, mc, frameObject.getSchema(),
        false, ",", false, -1);
  } catch (DMLRuntimeException e) {
    e.printStackTrace();
    return null;
  }
  frameObject.setRDDHandle(new RDDObject(rdd, variableName));
  return frameObject;
}