use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
the class CarbonInputSplitTaskInfo method getLocations.
@Override
public String[] getLocations() {
  // collect the distinct locations of all splits belonging to this task,
  // surfacing any IO failure while reading split locations
  Set<String> locations = new HashSet<>();
  for (CarbonInputSplit splitInfo : carbonBlockInfoList) {
    try {
      locations.addAll(Arrays.asList(splitInfo.getLocations()));
    } catch (IOException e) {
      throw new RuntimeException("Fail to get location of split: " + splitInfo, e);
    }
  }
  // return only the node(s) hosting the maximum number of blocks for this task
  List<String> nodes = CarbonInputSplitTaskInfo.maxNoNodes(carbonBlockInfoList);
  return nodes.toArray(new String[nodes.size()]);
}
use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
the class CarbonStreamInputFormatTest method buildInputSplit.
private InputSplit buildInputSplit() throws IOException {
  // wrap a single (empty) CarbonInputSplit into a multi-block split pinned to localhost
  CarbonInputSplit carbonInputSplit = new CarbonInputSplit();
  List<CarbonInputSplit> splitList = new ArrayList<>();
  splitList.add(carbonInputSplit);
  return new CarbonMultiBlockSplit(splitList, new String[] { "localhost" }, FileFormat.ROW_V1);
}
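If the test class wanted to verify this helper directly, a minimal check might look like the sketch below. It assumes CarbonMultiBlockSplit reports back the location array passed to its constructor and that JUnit's @Test and Assert are already available in the test class; the assertion values are illustrative, not taken from the original test.

// Hypothetical usage of buildInputSplit(); expected values are assumptions for illustration.
@Test
public void testBuildInputSplitReportsLocalhost() throws Exception {
  InputSplit split = buildInputSplit();
  // getLocations() is declared by org.apache.hadoop.mapreduce.InputSplit
  String[] locations = split.getLocations();
  Assert.assertEquals(1, locations.length);
  Assert.assertEquals("localhost", locations[0]);
}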
use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
the class CarbonInputSplitTaskInfo method maxNoNodes.
/**
 * Finds the node that hosts the maximum number of blocks among the given splits.
 *
 * @param splitList splits whose block locations are to be examined
 * @return a single-element list containing the node with the most blocks, or the
 *         complete set of nodes if every node hosts the same number of blocks
 */
public static List<String> maxNoNodes(List<CarbonInputSplit> splitList) {
  boolean useIndex = true;
  Integer maxOccurence = 0;
  String maxNode = null;
  Map<String, Integer> nodeAndOccurenceMapping = new TreeMap<>();
  // populate the map of node and number of occurrences of that node.
  for (CarbonInputSplit split : splitList) {
    try {
      for (String node : split.getLocations()) {
        Integer nodeOccurence = nodeAndOccurenceMapping.get(node);
        if (null == nodeOccurence) {
          nodeAndOccurenceMapping.put(node, 1);
        } else {
          // write the incremented count back into the map
          nodeAndOccurenceMapping.put(node, nodeOccurence + 1);
        }
      }
    } catch (IOException e) {
      throw new RuntimeException("Fail to get location of split: " + split, e);
    }
  }
  Integer previousValueOccurence = null;
  // check which node occurs the maximum number of times.
  for (Map.Entry<String, Integer> entry : nodeAndOccurenceMapping.entrySet()) {
    // track the node with the maximum count.
    if (entry.getValue() > maxOccurence) {
      maxOccurence = entry.getValue();
      maxNode = entry.getKey();
    }
    // first-time scenario: initializing the previous value.
    if (null == previousValueOccurence) {
      previousValueOccurence = entry.getValue();
    } else {
      // counts differ, so only the max node should be returned instead of the complete list.
      if (!Objects.equals(previousValueOccurence, entry.getValue())) {
        useIndex = false;
      }
    }
  }
  // if all the nodes have an equal number of occurrences, return the complete key set.
  if (useIndex) {
    return new ArrayList<>(nodeAndOccurenceMapping.keySet());
  }
  // otherwise return only the node with the maximum number of blocks.
  List<String> node = new ArrayList<>(1);
  node.add(maxNode);
  return node;
}
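The tie-handling behaviour above is easier to see with plain data. The following standalone sketch reproduces the same counting logic with hard-coded location arrays instead of real CarbonInputSplit objects; the node names and split layout are made up for illustration.

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

// Standalone illustration of the maxNoNodes counting logic; the data is hypothetical.
public class MaxNodeSketch {
  public static void main(String[] args) {
    // pretend these are the location arrays reported by three splits
    String[][] splitLocations = {
        { "node1", "node2" },
        { "node1", "node3" },
        { "node1" }
    };
    Map<String, Integer> counts = new TreeMap<>();
    for (String[] locations : splitLocations) {
      for (String node : locations) {
        counts.merge(node, 1, Integer::sum);
      }
    }
    // counts: node1 -> 3, node2 -> 1, node3 -> 1
    boolean allEqual = counts.values().stream().distinct().count() == 1;
    List<String> result;
    if (allEqual) {
      // every node hosts the same number of blocks: return all of them
      result = new ArrayList<>(counts.keySet());
    } else {
      // otherwise return only the most loaded node
      result = new ArrayList<>(1);
      result.add(counts.entrySet().stream().max(Map.Entry.comparingByValue()).get().getKey());
    }
    System.out.println(result); // prints [node1]
  }
}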
use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
the class CarbonFileInputFormat method getSplits.
/**
 * {@inheritDoc}
 * Configurations FileInputFormat.INPUT_DIR and CarbonTableInputFormat.INPUT_SEGMENT_NUMBERS
 * are used to get the table path to read.
 *
 * @return a list of splits for the blocks that match the filter in the given segments
 * @throws IOException if block or delete-delta information cannot be read
 */
private List<InputSplit> getSplits(JobContext job, FilterResolverIntf filterResolver,
    List<Segment> validSegments, BitSet matchedPartitions, PartitionInfo partitionInfo,
    List<Integer> oldPartitionIdList) throws IOException {
  numSegments = validSegments.size();
  List<InputSplit> result = new LinkedList<InputSplit>();
  UpdateVO invalidBlockVOForSegmentId = null;
  boolean isIUDTable = false;
  SegmentUpdateStatusManager updateStatusManager = new SegmentUpdateStatusManager(carbonTable);
  isIUDTable = (updateStatusManager.getUpdateStatusDetails().length != 0);
  // for each segment, fetch the blocks matching the filter in the driver BTree
  List<CarbonInputSplit> dataBlocksOfSegment = getDataBlocksOfSegment(job, carbonTable,
      filterResolver, matchedPartitions, validSegments, partitionInfo, oldPartitionIdList);
  numBlocks = dataBlocksOfSegment.size();
  for (CarbonInputSplit inputSplit : dataBlocksOfSegment) {
    // get the UpdateVO for tables on which IUD operations are being performed.
    if (isIUDTable) {
      invalidBlockVOForSegmentId =
          updateStatusManager.getInvalidTimestampRange(inputSplit.getSegmentId());
    }
    String[] deleteDeltaFilePath = null;
    if (isIUDTable) {
      // skip invalidated blocks.
      if (CarbonUtil.isInvalidTableBlock(inputSplit.getSegmentId(),
          inputSplit.getPath().toString(), invalidBlockVOForSegmentId, updateStatusManager)) {
        continue;
      }
      // only when IUD has been performed, fetch the delete delta files for the block
      try {
        deleteDeltaFilePath = updateStatusManager.getDeleteDeltaFilePath(
            inputSplit.getPath().toString(), inputSplit.getSegmentId());
      } catch (Exception e) {
        throw new IOException(e);
      }
    }
    inputSplit.setDeleteDeltaFiles(deleteDeltaFilePath);
    result.add(inputSplit);
  }
  return result;
}
use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
the class CarbonFileInputFormat method getSplits.
/**
 * {@inheritDoc}
 * Configuration FileInputFormat.INPUT_DIR is used to get the table path to read.
 *
 * @param job the job context carrying the configuration
 * @return List<InputSplit> list of CarbonInputSplit
 * @throws IOException if the schema file or segment files cannot be read
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  AbsoluteTableIdentifier identifier = getAbsoluteTableIdentifier(job.getConfiguration());
  CarbonTable carbonTable = getOrCreateCarbonTable(job.getConfiguration());
  if (null == carbonTable) {
    throw new IOException("Missing/Corrupt schema file for table.");
  }
  if (getValidateSegmentsToAccess(job.getConfiguration())) {
    // get all valid segments and set them into the configuration
    // check for the external table segment (Segment_null)
    // process and resolve the filter expression
    Expression filter = getFilterPredicates(job.getConfiguration());
    TableProvider tableProvider = new SingleTableProvider(carbonTable);
    // this will be null in case of a corrupt schema file.
    PartitionInfo partitionInfo = carbonTable.getPartitionInfo(carbonTable.getTableName());
    carbonTable.processFilterExpression(filter, null, null);
    FilterResolverIntf filterInterface = carbonTable.resolveFilter(filter, tableProvider);
    String segmentDir = CarbonTablePath.getSegmentPath(identifier.getTablePath(), "null");
    FileFactory.FileType fileType = FileFactory.getFileType(segmentDir);
    if (FileFactory.isFileExist(segmentDir, fileType)) {
      // if the external table segment directory exists, add the segment to the list
      List<Segment> externalTableSegments = new ArrayList<Segment>();
      Segment seg = new Segment("null", null);
      externalTableSegments.add(seg);
      Map<String, String> indexFiles =
          new SegmentIndexFileStore().getIndexFilesFromSegment(segmentDir);
      if (indexFiles.size() == 0) {
        throw new RuntimeException("Index file not present to read the carbondata file");
      }
      // do block filtering and get the splits
      return getSplits(job, filterInterface, externalTableSegments, null, partitionInfo, null);
    }
  }
  // nothing to read: segment validation disabled or no segment directory found
  return null;
}
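For context, a driver that asks this input format for splits would typically look something like the sketch below. The table path, the generic parameter, and the direct use of CarbonFileInputFormat's no-argument constructor are assumptions for illustration; a real job usually needs additional CarbonData-specific configuration (database and table name, segment settings) before the table can be resolved.

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.carbondata.hadoop.api.CarbonFileInputFormat;

// Hypothetical driver sketch: the table path and raw use of the format are assumptions.
public class SplitDriverSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    // FileInputFormat.INPUT_DIR supplies the table path, as described in the Javadoc above
    FileInputFormat.addInputPath(job, new Path("/path/to/carbon/table"));
    CarbonFileInputFormat<Object> format = new CarbonFileInputFormat<>();
    List<InputSplit> splits = format.getSplits(job);
    if (splits != null) {
      for (InputSplit split : splits) {
        System.out.println(split);
      }
    }
  }
}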